author     Michael Tyler <michael.tyler@arm.com>  2023-04-12 17:43:17 +0100
committer  michael.tyler <michael.tyler@arm.com>  2023-06-05 15:57:58 +0000
commit     74921eee924625426429044decefe3673561b174 (patch)
tree       654da1a95e3d42d6af8ad1ff27bb40d77b1fd8c5 /src/core/NEON/kernels
parent     df5d9878008be9b60586df97ebfff197abb5195e (diff)
download   ComputeLibrary-74921eee924625426429044decefe3673561b174.tar.gz
Update CPU kernel implementations and guard directives
Resolves COMPMID-6023

Change-Id: I868975d14c4f98af6716726feda22405a6a4c891
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
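For context on the subject line: the "guard directives" are the preprocessor conditionals that fence each specialised kernel off to the CPU features it requires, so a translation unit compiles to nothing on targets that cannot run it. Below is a minimal sketch of that pattern; the guard macros (__aarch64__, __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) are standard compiler-defined ones, but the function and its body are illustrative only, not code from this patch.

// Sketch of the guard pattern around an AArch64 fp16 kernel: the body only
// exists when the target is AArch64 and the compiler provides fp16 vector
// arithmetic. The kernel itself is a made-up example, not the real code.
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

#include <arm_neon.h>
#include <cstddef>

namespace example {

// Hypothetical fp16 multiply-accumulate over one row: out[i] += in[i] * w[i].
void mla_row(const __fp16 *in, const __fp16 *w, __fp16 *out, size_t n)
{
  size_t i = 0;
  for (; i + 8 <= n; i += 8)
  {
    float16x8_t vi = vld1q_f16(in + i);   // load eight fp16 inputs
    float16x8_t vw = vld1q_f16(w + i);    // load eight fp16 weights
    float16x8_t vo = vld1q_f16(out + i);  // load the accumulators
    vo = vfmaq_f16(vo, vi, vw);           // fused multiply-add, eight lanes
    vst1q_f16(out + i, vo);
  }
  for (; i < n; i++)                      // scalar tail
  {
    out[i] += in[i] * w[i];
  }
}

} // namespace example

#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)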
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--  src/core/NEON/kernels/arm_conv/addressing.cpp  5
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp  90
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp  178
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp  178
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp  40
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp  5
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp  5
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  273
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  350
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  567
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  1290
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  969
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1848
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  293
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  462
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  661
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  808
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp  356
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1510
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  279
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  354
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  573
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  1072
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  972
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  299
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  466
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  661
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  810
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp  216
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp  542
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp  1281
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1196
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  2738
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1850
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1798
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  3194
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  316
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  378
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  11
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  532
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  2120
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  11
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  2496
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  2738
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1850
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1798
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  3194
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  316
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  378
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  532
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  2120
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1856
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1786
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  3632
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1850
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1798
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  3194
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp  7
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  316
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  2120
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  192
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  252
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  466
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  648
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  710
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1000
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  184
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  286
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp  394
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp  560
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp  1132
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp  1208
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp  582
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp  698
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp  1356
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp  1288
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp  582
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp  830
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp  1570
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp  1434
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp  582
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp  830
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp  1570
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp  1434
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp  582
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp  830
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp  1570
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp  1434
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  252
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  514
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  734
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  884
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1188
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  270
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  330
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  642
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  838
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  252
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  514
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  734
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  884
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1188
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  270
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  330
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  642
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  838
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp  28
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp  304
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp  554
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  744
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  880
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  1028
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  338
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  426
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  756
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  880
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  1028
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  338
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  424
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  1028
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  62
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp  275
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  53
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp  274
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  67
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp  239
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  56
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp  239
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp  283
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  56
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp  359
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp  283
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp  488
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp  283
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  56
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp  359
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp  303
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp  515
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp  16
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  38
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp  155
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  54
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp  153
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  38
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp  153
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  54
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp  151
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp  157
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  54
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp  151
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp  161
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp  221
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp  165
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  54
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp  151
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp  177
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp  231
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  62
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp  62
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp  126
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp  136
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp  208
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp  146
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp  84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp  156
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp  382
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp  312
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp  6
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp  256
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp  28
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int16.cpp  3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int8.cpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp  566
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp  7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp  7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp  3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp  107
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp  195
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp  107
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp  87
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp  195
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp  126
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp  209
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp  152
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp  126
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp  209
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp  152
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp  125
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp  81
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp  145
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp  89
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp  189
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp  294
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp  294
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp  273
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp  158
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp  158
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp  176
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp  176
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp  102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp  102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp  152
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp  22
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp  1876
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp  3020
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp  1744
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp  600
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp  146
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp  228
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp  144
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp  222
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp  1962
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp  1880
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp  19
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp  4077
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp  3021
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp  1628
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp  1120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp  2416
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp  1746
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp  2786
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp  338
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp  602
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp  942
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp  2632
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp  1612
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp  1616
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp  4330
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp  3620
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp  3560
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp  21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp  2462
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp  1702
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp  1704
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp  2632
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp  1612
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp  1616
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp  2462
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp  1702
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp  1704
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp  121
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp  21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp  201
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp  541
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp  541
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp  679
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp  751
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp  751
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp  248
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp  252
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp  300
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp  248
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp  252
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp  300
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp  298
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp  296
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp  326
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp  202
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp  216
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp  13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp  264
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp  298
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp  296
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp  326
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp  1934
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp  678
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp  4010
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp  678
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp  2282
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp  1032
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp  288
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp  194
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp  146
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp  194
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp  148
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp  2283
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp  1936
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp  679
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp  4011
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp  679
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp  2283
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp  475
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp  1067
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp  1033
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp  1473
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp  1663
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp  1513
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp  3271
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp  2829
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp  31
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp  467
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp  2071
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp  1809
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp  1663
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp  16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp  1513
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp  31
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp  467
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp  2071
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp  1809
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp  122
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp  258
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp  22
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp  182
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp  120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp  28
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp  182
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp  120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp  34
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp  182
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp  120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp  22
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp  258
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp  34
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp  182
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp  120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp  258
-rw-r--r--  src/core/NEON/kernels/arm_gemm/misc.cpp  6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantized.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp  2
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp208
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp143
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp132
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/utils.hpp5
-rw-r--r--src/core/NEON/kernels/assembly/winograd.hpp4
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transform.hpp9
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp8
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transform.hpp4
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp8
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp8
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp8
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp5
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp10
-rw-r--r--src/core/NEON/kernels/convolution/winograd/padding.cpp191
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transform.hpp4
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp5
648 files changed, 127783 insertions, 128976 deletions
diff --git a/src/core/NEON/kernels/arm_conv/addressing.cpp b/src/core/NEON/kernels/arm_conv/addressing.cpp
index d01627bc5a..2460398880 100644
--- a/src/core/NEON/kernels/arm_conv/addressing.cpp
+++ b/src/core/NEON/kernels/arm_conv/addressing.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,8 @@
*/
#include "addressing.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
+#include <algorithm>
#include <cstring>
namespace arm_conv {
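The includes above drop the long project-relative paths in favour of bare header names, which only resolves if the build adds the relevant kernel directories to the header search path. A minimal sketch of the effect; the -I directories shown are an assumption about the build setup, not taken from this patch:

// Assuming hypothetical search-path flags such as
//   -Isrc/core/NEON/kernels/arm_gemm -Isrc/core/NEON/kernels/assembly
// the shortened includes resolve to the same headers as before:
#include "utils.hpp"   // previously "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include <algorithm>   // presumably added for std::min/std::max uses in this file
#include <cstring>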
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index c305835107..b6f45c6825 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -24,8 +24,8 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
index c2b861000c..2950d5e957 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -10,8 +10,8 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "depthwise_common.hpp"
+#include "utils.hpp"
+
using arm_gemm::iceildiv;
namespace arm_conv {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
index cef568fadd..3d305b6d18 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -27,10 +27,6 @@
#include "depthwise_depthfirst.hpp"
#include "interleaves/generic_quantized_dot_product.hpp"
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
#include <limits>
namespace arm_conv {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
index 350e93b874..134dbd1b4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,18 @@ namespace
);
}
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
index 09ee983907..382ccd3c62 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,6 +103,18 @@ namespace
);
}
+ template <class Strategy>
+ unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ ) * 2 / 3;
+ }
+
#if defined(__aarch64__)
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
{
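The two estimators added above (planar_cycle_estimate in depthwise_fp16.cpp and fast_mode_cycle_estimate here) score a candidate strategy by the approximate amount of vector work it issues: the output extent rounded up to the strategy's tile, multiplied by the number of vector-register passes over the channels, with the fast-mode variant scaled by 2/3, presumably so that it is preferred over a plain estimate of the same shape. A self-contained sketch of the same arithmetic with illustrative numbers; none of the values or helper names below come from the patch:

#include <cstdio>

// Standalone stand-ins for arm_gemm::roundup / arm_gemm::iceildiv.
static unsigned int roundup(unsigned int a, unsigned int b) { return ((a + b - 1) / b) * b; }
static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

int main()
{
    const unsigned int output_rows = 56, output_cols = 56;   // example output size
    const unsigned int input_channels = 64, channel_multiplier = 1;
    const unsigned int tile_rows = 4, tile_cols = 4;          // strategy tile
    const unsigned int vector_length = 4;                     // fp32 lanes in a 128-bit vector

    // fast_mode_cycle_estimate-style score: both output dimensions rounded
    // up to the tile, times the channel passes, scaled by 2/3.
    const unsigned int estimate = roundup(output_rows, tile_rows) *
                                  roundup(output_cols, tile_cols) *
                                  iceildiv(input_channels * channel_multiplier, vector_length) * 2 / 3;
    std::printf("estimated cost: %u\n", estimate);  // 56 * 56 * 16 * 2 / 3 = 33450
    return 0;
}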
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
index 1ba7694f1e..15064aeedc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
#pragma once
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "depthwise.hpp"
namespace arm_conv
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index 2b2e6f3555..567eab13f3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#pragma once
+
#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"
@@ -52,7 +54,7 @@ struct PlanarKernelType;
template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
{
- using Type = std::function<void(
+ typedef void (*Type)(
const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
unsigned int pad_top, unsigned int valid_input_rows,
unsigned int pad_left, unsigned int valid_input_cols,
@@ -60,7 +62,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
TOutput **, const size_t *, const size_t *, unsigned int output_cols,
unsigned int start_channels, unsigned int valid_channels,
TAccum act_min, TAccum act_max
- )>;
+ );
template <typename WorkspaceType>
static inline void execute(
@@ -89,7 +91,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
template <typename TInput, typename TWeight, typename TOutput>
struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
- using Type = std::function<void(
+ typedef void (*Type)(
const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
unsigned int pad_top, unsigned int valid_input_rows,
unsigned int pad_left, unsigned int valid_input_cols,
@@ -97,7 +99,7 @@ struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize3
TOutput **, const size_t *, const size_t *, unsigned int output_cols,
unsigned int start_channel, unsigned int valid_channels,
const arm_gemm::Requantize32 &
- )>;
+ );
template <typename WorkspaceType>
static inline void execute(
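Swapping std::function for a plain function-pointer typedef is safe here because every planar kernel is a stateless free function, and it removes std::function's type-erasure wrapper (and its possible heap allocation) from the dispatch path. A minimal sketch of the difference, using made-up kernel names:

#include <cstddef>
#include <functional>

// A plain function pointer: one indirect call, no wrapper object.
typedef void (*KernelFn)(const float *input, float *output, size_t n);

void scale_by_two(const float *input, float *output, size_t n)
{
    for (size_t i = 0; i < n; i++) output[i] = input[i] * 2.0f;
}

int main()
{
    KernelFn fn = scale_by_two;  // direct pointer to the free function
    std::function<void(const float *, float *, size_t)> erased = scale_by_two;  // heavier: type-erased wrapper

    float in[4] = {1.0f, 2.0f, 3.0f, 4.0f}, out[4];
    fn(in, out, 4);
    erased(in, out, 4);
    return 0;
}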
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
index 99b91fb833..39f60c362b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "interleaves/generic.hpp"
#include "depthfirst_driver.hpp"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
deleted file mode 100644
index d59d6b7e35..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "8b_mla.hpp"
-
-size_t generic_get_packed_size(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_input_channels
-)
-{
- const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
- return arm_gemm::roundup((long unsigned int) n_input_channels, per_iter) * kernel_rows * kernel_cols * sizeof(int8_t);
-}
-
-void generic_pack(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_channels,
- void *_outptr,
- const void *_weights,
- size_t ld_weight_col,
- size_t ld_weight_row
-)
-{
- int8_t *outptr = reinterpret_cast<int8_t *>(_outptr);
- const int8_t *weights = reinterpret_cast<const int8_t *>(_weights);
-
- // Get the strides
- ld_weight_col = (ld_weight_col == 0) ? n_channels * sizeof(int8_t) : ld_weight_col;
- ld_weight_row = (ld_weight_row == 0) ? kernel_cols * ld_weight_col : ld_weight_row;
-
- // Pack into per-iter chunks.
- const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
- for (unsigned int c = 0; c < n_channels; c += per_iter)
- {
- auto weight_row = weights + c;
- const auto to_copy = std::min<unsigned int>(per_iter, n_channels - c);
-
- for (unsigned int i = 0; i < kernel_rows; i++)
- {
- auto weight_col = weight_row;
-
- for (unsigned int j = 0; j < kernel_cols; j++)
- {
- memcpy(outptr, weight_col, to_copy);
- outptr += per_iter;
- weight_col += ld_weight_col;
- }
-
- weight_row += ld_weight_row;
- }
- }
-}
-
-namespace arm_conv {
-namespace depthwise {
-
-ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3)
-ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 5, 5)
-ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 3, 3)
-ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 5, 5)
-
-} // namespace depthwise
-} // namespace arm_conv
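For reference, the generic packing routine deleted above interleaves weights in channel chunks: each chunk covers per_iter channels (the accumulator depth times the vector length), and within a chunk the kernel taps are written row-major, one per_iter-wide slot per tap, with the final partial chunk zero-padded. A self-contained sketch of that layout, simplified to the default (dense) strides:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Pack [kernel_rows][kernel_cols][n_channels] weights into
// [chunk][row][col][per_iter] order, mirroring the deleted generic_pack.
std::vector<int8_t> pack_weights(const int8_t *weights, unsigned int n_channels,
                                 unsigned int kernel_rows, unsigned int kernel_cols,
                                 unsigned int per_iter)
{
    const unsigned int n_chunks = (n_channels + per_iter - 1) / per_iter;
    std::vector<int8_t> packed(n_chunks * kernel_rows * kernel_cols * per_iter, 0);
    int8_t *outptr = packed.data();

    const size_t ld_weight_col = n_channels;                    // default strides, as in
    const size_t ld_weight_row = kernel_cols * ld_weight_col;   // the deleted code

    for (unsigned int c = 0; c < n_channels; c += per_iter)
    {
        const unsigned int to_copy = std::min(per_iter, n_channels - c);
        for (unsigned int i = 0; i < kernel_rows; i++)
        {
            for (unsigned int j = 0; j < kernel_cols; j++)
            {
                std::memcpy(outptr, weights + c + i * ld_weight_row + j * ld_weight_col, to_copy);
                outptr += per_iter;  // tail of a partial chunk stays zero
            }
        }
    }
    return packed;
}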
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp
deleted file mode 100644
index 3176d1dedd..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
-#include <cstdint>
-#include <cstring>
-
-using namespace arm_gemm;
-
-size_t generic_get_packed_size(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_input_channels
-);
-
-void generic_pack(
- const VLType vec_type,
- const unsigned int acc_depth,
- const unsigned int kernel_rows,
- const unsigned int kernel_cols,
- const unsigned int n_channels,
- void *_outptr,
- const void *_weights,
- size_t ld_weight_col,
- size_t ld_weight_row
-);
-
-#define ADD_IMPLEMENTATION(ARCH, TYPENAME, TYPE, VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS) \
-struct interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla \
-{ \
- static size_t get_packed_size(const DepthwiseArgs &args); \
- static void pack_parameters( \
- unsigned int n_channels, void *outptr, \
- const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row \
- ); \
-}; \
-\
-size_t interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::get_packed_size(const DepthwiseArgs &args) \
-{ \
- return generic_get_packed_size(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, args.input_channels); \
-} \
-\
-void interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::pack_parameters(unsigned int n_channels, void *outptr, \
- const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row) \
-{ \
- generic_pack(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, n_channels, outptr, weights, ld_weight_col, ld_weight_row); \
-}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
index adda78f164..5e4bf99120 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(__aarch64__)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -54,162 +54,162 @@ void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cmp %x[ld_weight_col], XZR\n"
"csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"movi v16.4s, #0x9\n"
- "movi v0.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"mov x21, #0x3\n"
"mul x21, %x[ld_weight_col], x21\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
- "ld1r { v31.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_weights_offset]\n"
"ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v30.4s, v30.4s, v31.4s\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
"csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v29.16b, #0x1\n"
- "mul v30.4s, v30.4s, v16.4s\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
"add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v28.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
"ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x24, x25, %x[ld_weight_row]\n"
"add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
"mov x22, #0x0\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q26, [%x[bias], x22]\n"
+ "ldr q25, [%x[bias], x22]\n"
"2:" // Loop: Skip bias load
- "ldr s25, [%x[weights], #0x0]\n"
- "ldr s22, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- "ldr s20, [%x[weights], x23]\n"
- "ldr s23, [x25, #0x0]\n"
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "ldr s21, [x25, %x[ld_weight_col]]\n"
- "ldr s18, [x25, x23]\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
"ldr s17, [x24, #0x0]\n"
"ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x4e9697b8 // sdot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
+ ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
"ldr s16, [x24, x23]\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
- ".inst 0x4e9297b8 // sdot v24.4s, v29.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9097b8 // sdot v24.4s, v29.16b, v16.16b\n"
+ ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
"add %x[weights], %x[weights], #0x4\n"
"add x25, x25, #0x4\n"
- "mls v26.4s, v24.4s, v31.4s\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
"add x24, x24, #0x4\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q28, [%x[rq_mul_perchannel], x22]\n"
- "ldr q27, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x0]\n"
"add x22, x22, #0x10\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 7f\n"
"add %x[bias], %x[bias], x22\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v25.h }[0], [%x[weights]]\n"
- "ld1 { v23.h }[0], [x25]\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.h }[0], [x21]\n"
- "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.h }[0], [x21]\n"
+ "ld1 { v19.h }[0], [x21]\n"
"ld1 { v18.h }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.h }[0], [x24]\n"
- "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
"add x25, x25, #0x2\n"
- "ld1 { v16.h }[0], [x20]\n"
+ "ld1 { v21.h }[0], [x20]\n"
"add x24, x24, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.b }[2], [%x[weights]]\n"
- "ld1 { v23.b }[2], [x25]\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[2], [x21]\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v19.b }[2], [x21]\n"
"ld1 { v18.b }[2], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[2], [x24]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v25.b }[0], [%x[weights]]\n"
- "ld1 { v23.b }[0], [x25]\n"
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[0], [x21]\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[0], [x21]\n"
+ "ld1 { v19.b }[0], [x21]\n"
"ld1 { v18.b }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[0], [x24]\n"
- "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x4e9697b8 // sdot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9297b8 // sdot v24.4s, v29.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e9097b8 // sdot v24.4s, v29.16b, v16.16b\n"
- "mls v26.4s, v24.4s, v31.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
@@ -217,24 +217,24 @@ void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"add x21, %x[rq_mul_perchannel], x22\n"
"add x20, %x[rq_shift_perchannel], x22\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v28.d }[0], [x21], #0x8\n"
- "ld1 { v27.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v27.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q28, [%x[outptr], #0x0]\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
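The assembly above is functionally unchanged: the registers have been renumbered so that the routine, which previously used v0 alongside v16-v31, now stays entirely within v16-v31, and v0 drops out of the clobber list. The clobber list is what tells the compiler which registers the asm block may overwrite, so keeping it minimal gives the register allocator more freedom around the call site. A tiny illustration of the mechanism, not library code:

#include <cstdint>

int64_t double_via_neon(int64_t x)
{
    int64_t out;
#if defined(__aarch64__)
    __asm__ volatile(
        "fmov d16, %x[in]\n"            // stay in v16+, leaving v0-v15 untouched
        "add v16.2d, v16.2d, v16.2d\n"  // lane 0 now holds x + x
        "fmov %x[out], d16\n"
        : [out] "=r" (out)
        : [in] "r" (x)
        : "v16"                         // clobber exactly the register written
    );
#else
    out = x + x;
#endif
    return out;
}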
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
index b89886ae0c..314f09a0c5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(__aarch64__)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -54,162 +54,162 @@ void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cmp %x[ld_weight_col], XZR\n"
"csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"movi v16.4s, #0x9\n"
- "movi v0.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"mov x21, #0x3\n"
"mul x21, %x[ld_weight_col], x21\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
- "ld1r { v31.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_weights_offset]\n"
"ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v30.4s, v30.4s, v31.4s\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
"csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v29.16b, #0x1\n"
- "mul v30.4s, v30.4s, v16.4s\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
"add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v28.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
"ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x24, x25, %x[ld_weight_row]\n"
"add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
"mov x22, #0x0\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q26, [%x[bias], x22]\n"
+ "ldr q25, [%x[bias], x22]\n"
"2:" // Loop: Skip bias load
- "ldr s25, [%x[weights], #0x0]\n"
- "ldr s22, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- "ldr s20, [%x[weights], x23]\n"
- "ldr s23, [x25, #0x0]\n"
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "ldr s21, [x25, %x[ld_weight_col]]\n"
- "ldr s18, [x25, x23]\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
"ldr s17, [x24, #0x0]\n"
"ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x6e9697b8 // udot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
+ ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
"ldr s16, [x24, x23]\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
- ".inst 0x6e9297b8 // udot v24.4s, v29.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9097b8 // udot v24.4s, v29.16b, v16.16b\n"
+ ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
"add %x[weights], %x[weights], #0x4\n"
"add x25, x25, #0x4\n"
- "mls v26.4s, v24.4s, v31.4s\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
"add x24, x24, #0x4\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q28, [%x[rq_mul_perchannel], x22]\n"
- "ldr q27, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x0]\n"
"add x22, x22, #0x10\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v26.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"cbz %x[bias], 7f\n"
"add %x[bias], %x[bias], x22\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v25.h }[0], [%x[weights]]\n"
- "ld1 { v23.h }[0], [x25]\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.h }[0], [x21]\n"
- "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.h }[0], [x21]\n"
+ "ld1 { v19.h }[0], [x21]\n"
"ld1 { v18.h }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.h }[0], [x24]\n"
- "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
"add x25, x25, #0x2\n"
- "ld1 { v16.h }[0], [x20]\n"
+ "ld1 { v21.h }[0], [x20]\n"
"add x24, x24, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.b }[2], [%x[weights]]\n"
- "ld1 { v23.b }[2], [x25]\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[2], [x21]\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v19.b }[2], [x21]\n"
"ld1 { v18.b }[2], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[2], [x24]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v25.b }[0], [%x[weights]]\n"
- "ld1 { v23.b }[0], [x25]\n"
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
"add x20, %x[weights], x23\n"
- "ld1 { v22.b }[0], [x21]\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"add x21, x25, %x[ld_weight_col]\n"
"add x20, x25, x23\n"
- "ld1 { v21.b }[0], [x21]\n"
+ "ld1 { v19.b }[0], [x21]\n"
"ld1 { v18.b }[0], [x20]\n"
"add x21, x24, %x[ld_weight_col]\n"
"add x20, x24, x23\n"
- "ld1 { v17.b }[0], [x24]\n"
- "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v20.16b, v25.16b, v20.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v20.16b, v22.16b\n"
- "zip1 v20.16b, v23.16b, v18.16b\n"
- "zip1 v18.16b, v21.16b, v0.16b\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x6e9697b8 // udot v24.4s, v29.16b, v22.16b\n"
- "zip1 v18.16b, v20.16b, v18.16b\n"
"zip1 v17.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9297b8 // udot v24.4s, v29.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v0.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
"zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e9097b8 // udot v24.4s, v29.16b, v16.16b\n"
- "mls v26.4s, v24.4s, v31.4s\n"
- "add v26.4s, v26.4s, v30.4s\n"
- "str q26, [%x[outptr], #0x0]\n"
- "str q22, [%x[outptr], #0x10]\n"
+ ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
"str q18, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
@@ -217,24 +217,24 @@ void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"add x21, %x[rq_mul_perchannel], x22\n"
"add x20, %x[rq_shift_perchannel], x22\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v28.d }[0], [x21], #0x8\n"
- "ld1 { v27.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v27.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q28, [%x[outptr], #0x0]\n"
- "str q27, [%x[outptr], #0x10]\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
index 5b5ae17806..756c50b98c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "depthwise.hpp"
#include <functional>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp
deleted file mode 100644
index de74ca5f43..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "8b_mla.hpp"
-
-namespace arm_conv {
-namespace depthwise {
-
-#if defined(__ARM_FEATURE_SVE)
-
-ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 3, 3)
-ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 5, 5)
-ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 3, 3)
-ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 5, 5)
-
-#endif // defined(__ARM_FEATURE_SVE)
-
-} // namespace depthwise
-} // namespace arm_conv
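The file deleted above was guarded by the compiler feature macro __ARM_FEATURE_SVE, whereas the SVE interleave sources that remain (below) are guarded by the library's build flag ARM_COMPUTE_ENABLE_SVE, consistent with the guard-directive update this patch makes elsewhere. The build flag lets SVE kernels be compiled into a multi-ISA binary and chosen at run time even when the translation unit's baseline architecture does not itself enable SVE. Schematically, with illustrative function names:

#include <cstdint>

#if defined(ARM_COMPUTE_ENABLE_SVE)
// Built whenever the library enables its SVE kernels, regardless of the
// baseline -march; selected at run time by CPU capability probing.
void interleave_sve(const int8_t *weights, void *outptr, unsigned int n_channels);
#endif // defined(ARM_COMPUTE_ENABLE_SVE)

#if defined(__ARM_FEATURE_SVE)
// By contrast, this is only seen when the compiler itself targets SVE
// for the whole translation unit (e.g. -march=armv8.2-a+sve).
void interleave_sve_baseline(const int8_t *weights, void *outptr, unsigned int n_channels);
#endif // defined(__ARM_FEATURE_SVE)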
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
index 0cf8044733..3a4999296a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -76,7 +76,6 @@ void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
-
"2:" // Loop
"cntp x20, p2, p1.s\n"
"whilelt p0.b, XZR, x20\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
index e5bc8198f8..7c5d3c4904 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -25,8 +25,8 @@
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_gemm.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
#include <cstdint>
namespace arm_conv {
@@ -76,7 +76,6 @@ void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
-
"2:" // Loop
"cntp x20, p2, p1.s\n"
"whilelt p0.b, XZR, x20\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index d2db12535f..6beaba841f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
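The constructor above now forwards the class's own named constants through the Parent type instead of repeating the literals 2, 3, 1, so the strategy's geometry is stated in exactly one place. A reduced sketch of the pattern with simplified types:

#include <cassert>

struct StrategyBase
{
    unsigned int m_output_rows, m_output_cols;
    unsigned int m_kernel_rows, m_kernel_cols;
    unsigned int m_stride_rows, m_stride_cols;

    StrategyBase(unsigned int orows, unsigned int ocols,
                 unsigned int krows, unsigned int kcols,
                 unsigned int srows, unsigned int scols)
    : m_output_rows(orows), m_output_cols(ocols),
      m_kernel_rows(krows), m_kernel_cols(kcols),
      m_stride_rows(srows), m_stride_cols(scols) {}
};

struct Strategy : public StrategyBase
{
    using Parent = StrategyBase;
    constexpr static unsigned int output_rows = 2, output_cols = 2;
    constexpr static unsigned int kernel_rows = 3, kernel_cols = 3;
    constexpr static unsigned int stride_rows = 1, stride_cols = 1;

    // Forward the named constants: one source of truth, no magic numbers.
    Strategy() : Parent(output_rows, output_cols, kernel_rows, kernel_cols,
                        stride_rows, stride_cols) {}
};

int main()
{
    Strategy s;
    assert(s.m_kernel_rows == 3 && s.m_stride_rows == 1);
    return 0;
}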
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index f4027df375..d8ca3d7437 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -116,9 +116,9 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"add x28, x9, x25, LSL #1\n"
"add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x27, x28, x25, LSL #1\n"
"add x26, x11, x15\n"
"add x25, x12, x24, LSL #1\n"
@@ -126,7 +126,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x10, #0x20]\n"
@@ -145,162 +145,162 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
"add x23, x23, #0x10\n"
"cmp x23, x22, LSL #4\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
- "ldr q16, [x10, #0x0]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"add x13, x13, #0x10\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"ldr q4, [x10, #0x50]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x28]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
"ldr q1, [x10, #0x20]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
"ldr q0, [x10, #0x10]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
"ldr q2, [x10, #0x30]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
"ldr q13, [x28, x15]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x27, x15]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
"ldr q3, [x10, #0x40]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x11]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
"ldr q5, [x10, #0x60]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"ldr q11, [x13, x26]\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
"ldr q9, [x9, x15]\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
"ld1 { v10.8h }, [x13]\n"
"ldr q6, [x10, #0x70]\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
"ldr q12, [x9, x11]\n"
"ldr q7, [x10, #0x80]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
"ldr q8, [x10, #0x90]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
"add x27, x27, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x12]\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
"add x10, x10, #0xa0\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x14]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.8h }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"add x13, x13, #0x10\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x28]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x27, x15]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x11]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"add x27, x27, #0x10\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x12]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x14]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.8h }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 57f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"add x24, x9, x15\n"
"add x23, x13, XZR\n"
@@ -363,11 +363,11 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h12, [x21, #0x0]\n"
"ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
"add x20, x27, XZR\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v12.8h\n"
@@ -630,14 +630,14 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
"tbz %x[n_channels], #2, 54f\n"
"mov x21, x12\n"
"mov x20, x25\n"
@@ -699,7 +699,6 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"56:" // Tile loop: Oddments: Store: Bit 2: End
-
"57:" // Tile loop: End
"ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -714,7 +713,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
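The widened clobber list at the end of this hunk is the functional counterpart of the register renaming: the rewritten assembly now also writes v19 through v27 as scratch and accumulator registers, and every register the asm body writes must be declared, otherwise the compiler may keep a live value in a register the kernel silently overwrites. A standalone sketch of the rule (a hypothetical helper, not this kernel):

    #include <arm_neon.h>

    // Any register the asm body writes (v24 and v25 here) must appear in the
    // clobber list, or the compiler is free to assume it still holds a value.
    static inline void double_fp16x8(__fp16 *dst, const __fp16 *src)
    {
        __asm__ __volatile__(
            "ldr q24, [%x[src]]\n"
            "fadd v25.8h, v24.8h, v24.8h\n"
            "str q25, [%x[dst]]\n"
            :
            : [dst] "r" (dst), [src] "r" (src)
            : "cc", "memory", "v24", "v25");
    }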
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index bea4715313..c9a554e9ad 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -83,16 +83,16 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x15, %x[n_channels], #0x3\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
"sub x27, XZR, x16\n"
"cbz x15, 3f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x16, x15, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -104,197 +104,197 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "ldr q9, [x26, x28]\n"
- "ldr q10, [x22, x28]\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr q11, [x25, x28]\n"
- "ldr q12, [x24, x28]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x28]\n"
- "bge 2f\n"
- "1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr q16, [x14, #0x0]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
"ldr q12, [x20, x28]\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
- "ldr x24, [x13, #0x58]\n"
- "ldr x23, [x13, #0x60]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x28]\n"
- "ldr x22, [x13, #0x68]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x28]\n"
- "ldr x21, [x13, #0x70]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"ldr q4, [x14, #0x50]\n"
- "ldr x20, [x13, #0x78]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x23, x28]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x21, x28]\n"
"ldr q1, [x14, #0x20]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x22, x28]\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x16]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x28]\n"
"ldr q3, [x14, #0x40]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
- "ldr q11, [x25, x16]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x22, x16]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "ldr q9, [x26, x16]\n"
- "ldr q10, [x22, x16]\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "ldr q12, [x24, x16]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x21, x16]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
"add x16, x16, #0x10\n"
"add x27, x27, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
"cmp x16, x15, LSL #4\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
"add x28, x28, #0x10\n"
- "str q28, [x12, x27]\n"
+ "str q24, [x12, x27]\n"
"add x14, x14, #0xa0\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v30.8h, v6.8h, v9.8h\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr x24, [x13, #0x58]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v29.8h, v6.8h, v13.8h\n"
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
"ldr x23, [x13, #0x60]\n"
"ldr x22, [x13, #0x68]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x28]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
"ldr x21, [x13, #0x70]\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x28]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0x78]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
"add x27, x27, #0x10\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x23, x28]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "fmla v30.8h, v7.8h, v11.8h\n"
- "fmla v31.8h, v6.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
"add x28, x28, #0x10\n"
- "fmla v28.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v8.8h, v12.8h\n"
- "fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x12, x27]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 56f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "mov x27, x28\n"
- "add x12, x12, x27\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x9, x9, x27\n"
+ "add x9, x9, x20\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -357,12 +357,12 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v12.h }[0], [x21], #0x2\n"
"ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
"ldr x20, [x13, #0x28]\n"
"add x20, x20, x28\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v12.8h\n"
@@ -635,14 +635,14 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"51:" // Oddments: Load input (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
"tbz %x[n_channels], #2, 53f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -687,7 +687,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"56:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
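Both the direct and indirect variants end every tile with the same activation clamp: the min and max bounds are broadcast once with ld1r (v27 and v26 after this rename) and then applied to each accumulator with fmax followed by fmin. A rough intrinsics equivalent, assuming a toolchain with the Armv8.2-A FP16 arithmetic extension enabled:

    #include <arm_neon.h>

    // ld1r { v27.8h }, [min]; ld1r { v26.8h }, [max]; then fmax, fmin.
    static inline float16x8_t clamp_fp16x8(float16x8_t acc,
                                           const __fp16 *min_p,
                                           const __fp16 *max_p)
    {
        const float16x8_t vmin = vld1q_dup_f16(min_p);  // broadcast activation floor
        const float16x8_t vmax = vld1q_dup_f16(max_p);  // broadcast activation ceiling
        return vminq_f16(vmaxq_f16(acc, vmin), vmax);
    }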
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 75368dfcf9..6bbd3508cb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 2b1dc3646d..4e64a2bf2b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"add x9, x11, x8\n"
"add x28, x15, x22, LSL #1\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v14.8h }, [x20]\n"
"add x27, x10, x25, LSL #1\n"
"add x26, x9, x8\n"
"add x25, x28, x22, LSL #1\n"
@@ -130,7 +130,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x24\n"
"cbz x23, 4f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x24, x23, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -149,304 +149,304 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"add x24, x24, #0x10\n"
"cmp x24, x23, LSL #4\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "ldr q16, [x14, #0x0]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x10]\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
"add x13, x13, #0x10\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
"add x10, x10, #0x10\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
"ld1 { v10.8h }, [x16]\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x12]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
"add x12, x12, #0x10\n"
"ldr q9, [x12, x11]\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x27, x11]\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
"ldr q11, [x16, x26]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
"ldr q13, [x13, x11]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
"add x27, x27, #0x10\n"
"ld1 { v12.8h }, [x27]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
"add x14, x14, #0xa0\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "st1 { v23.8h }, [x15]\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x15, x17]\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x15, x22]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
"st1 { v26.8h }, [x28]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.8h }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x10]\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
"add x13, x13, #0x10\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
"add x10, x10, #0x10\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x12]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
"add x12, x12, #0x10\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x27, x11]\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
"add x27, x27, #0x10\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "st1 { v23.8h }, [x15]\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x15, x17]\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x15, x22]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
"st1 { v26.8h }, [x28]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.8h }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 93f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"add x24, x12, x11\n"
"add x23, x16, XZR\n"
@@ -509,18 +509,18 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr h12, [x21, #0x0]\n"
"ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"add x20, x27, x26\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v6.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
@@ -1009,25 +1009,25 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 90f\n"
"mov x22, x15\n"
"mov x21, x28\n"
@@ -1134,7 +1134,6 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"st1 { v28.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"92:" // Tile loop: Oddments: Store: Bit 2: End
-
"93:" // Tile loop: End
"ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -1149,7 +1148,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
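The renumbering in this kernel follows the same scheme as the 2x2 variant: the bias vector moves to a register that stays live for the whole channel loop (v31 here, v25 in the 2x2 kernel), and each output accumulator is seeded from it with a mov that is immediately followed by its first fmla. In intrinsics, the per-accumulator setup is roughly this sketch:

    #include <arm_neon.h>

    // mov vN.16b, v31.16b      ->  acc = bias
    // fmla vN.8h, vW.8h, vI.8h ->  acc += weight * input
    static inline float16x8_t seed_accumulator(float16x8_t bias,
                                               float16x8_t weight,
                                               float16x8_t input)
    {
        float16x8_t acc = bias;
        return vfmaq_f16(acc, weight, input);
    }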
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 878aa29bcf..72e68482c6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -87,405 +87,405 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x3\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr q13, [x27, x13]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "ldr q16, [x15, #0x0]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0x68]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "ldr x25, [x14, #0x78]\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x23, x14]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x23, x12]\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q5, [x15, #0x60]\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "ldr q8, [x15, #0x90]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "ldr q7, [x15, #0x80]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "ldr q13, [x27, x8]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "ldr x23, [x16, #0x20]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x8]\n"
- "ldr q10, [x10, x8]\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x8]\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "ldr q12, [x28, x8]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "str q24, [x22, x12]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "str q25, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "str q26, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x8, x8, #0x10\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "cmp x8, x17, LSL #4\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "add x13, x13, #0x10\n"
- "str q28, [x22, x12]\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x21, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0x68]\n"
- "ldr x25, [x14, #0x78]\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v26.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.8h, v2.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.8h, v5.8h, v10.8h\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.8h, v0.8h, v11.8h\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v24.8h, v8.8h, v10.8h\n"
- "fmla v25.8h, v7.8h, v10.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.8h, v6.8h, v12.8h\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
- "fmla v23.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v28.8h, v8.8h, v11.8h\n"
- "fmla v30.8h, v6.8h, v13.8h\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "fmla v27.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.8h, v5.8h, v11.8h\n"
- "fmla v25.8h, v4.8h, v11.8h\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.8h, v8.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v12.8h\n"
- "fmla v27.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
- "fmla v28.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v13.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x23, x12]\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr x23, [x16, #0x20]\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.8h, v3.8h, v12.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "str q24, [x22, x12]\n"
- "fmla v29.8h, v8.8h, v13.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "ldr x22, [x16, #0x28]\n"
- "fmla v31.8h, v6.8h, v13.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "str q25, [x21, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "str q26, [x20, x12]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "add x13, x13, #0x10\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x22, x12]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x21, x12]\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 92f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x24, [x14, #0x0]\n"
- "ldr x23, [x14, #0x8]\n"
- "add x24, x24, x13\n"
- "add x23, x23, x13\n"
- "ldr x22, [x14, #0x10]\n"
- "ldr x21, [x14, #0x18]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -534,19 +534,19 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ld1 { v12.h }[0], [x21], #0x2\n"
"ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
- "mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "ldr x20, [x14, #0x28]\n"
- "add x20, x20, x13\n"
- "mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v6.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
@@ -574,9 +574,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (4, 4): Bit 2: End
- "ldr x20, [x14, #0x30]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -597,10 +597,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
@@ -625,10 +625,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x15, #0x40]\n"
"fmla v23.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -649,10 +649,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -673,10 +673,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
@@ -701,10 +701,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v23.8h, v3.8h, v11.8h\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -725,10 +725,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -749,10 +749,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -773,10 +773,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
@@ -801,10 +801,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -825,10 +825,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v29.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -849,10 +849,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 57f\n"
@@ -875,10 +875,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 61f\n"
@@ -901,10 +901,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 65f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -925,10 +925,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 69f\n"
@@ -951,10 +951,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v25.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -976,10 +976,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 77f\n"
@@ -1002,10 +1002,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 81f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1027,10 +1027,10 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -1054,236 +1054,234 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"87:" // Oddments: Load input (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v23.8h, v23.8h, v17.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "fmin v25.8h, v25.8h, v17.8h\n"
- "fmin v26.8h, v26.8h, v17.8h\n"
- "fmin v27.8h, v27.8h, v17.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 89f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.d }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.d }[0], [x22]\n"
- "st1 { v25.d }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x8\n"
- "st1 { v28.d }[0], [x22]\n"
- "st1 { v29.d }[0], [x21]\n"
- "st1 { v30.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 88f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x4\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[6], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[6], [x22]\n"
- "st1 { v25.h }[6], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[6], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[6], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[6], [x22]\n"
- "st1 { v29.h }[6], [x21]\n"
- "st1 { v30.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[4], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[4], [x22]\n"
- "st1 { v25.h }[4], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[4], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[4], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[4], [x22]\n"
- "st1 { v29.h }[4], [x21]\n"
- "st1 { v30.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[0], [x22]\n"
- "st1 { v25.s }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x4\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[2], [x22]\n"
- "st1 { v25.h }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[2], [x22]\n"
- "st1 { v29.h }[2], [x21]\n"
- "st1 { v30.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.h }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.h }[0], [x22]\n"
- "st1 { v25.h }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.h }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.h }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.h }[0], [x22]\n"
- "st1 { v29.h }[0], [x21]\n"
- "st1 { v30.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"91:" // Oddments: Store: Bit 2: End
-
"92:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
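The fmax/fmin instruction pairs in the hunks above implement the kernel's activation clamp: each fp16 accumulator is clamped to [act_min, act_max], with the two scalars broadcast into vector registers via ld1r from the params struct (offsetof_args_min / offsetof_args_max) before the channel loop. A minimal scalar sketch of the per-lane computation, assuming a hypothetical clamp_activation helper that is not part of this patch:

    #include <algorithm>

    // Scalar equivalent of the vectorised clamp applied to every lane:
    //   fmax vD.8h, vD.8h, v_min.8h   ->  acc = max(acc, act_min)
    //   fmin vD.8h, vD.8h, v_max.8h   ->  acc = min(acc, act_max)
    static inline __fp16 clamp_activation(__fp16 acc, __fp16 act_min, __fp16 act_max)
    {
        return std::min(std::max(acc, act_min), act_max);
    }

Ordering max-then-min matches the assembly and makes the bounded-ReLU case (act_min = 0) fall out of the same two instructions; the register renaming in this commit only changes which vector registers hold the broadcast bounds, not the clamp itself.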
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 4f0de6b61c..04fb532937 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index a3a372be05..a1e1dd0e99 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -124,9 +124,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x22, LSL #1\n"
"add x23, x5, x5\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x26, x9, x24, LSL #1\n"
"add x25, x28, x4\n"
"add x24, x27, x22, LSL #1\n"
@@ -134,7 +134,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x6\n"
"cbz x13, 4f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x6, x13, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -152,499 +152,499 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"add x6, x6, #0x10\n"
"cmp x6, x13, LSL #4\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
"ldr q9, [x12, x17]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v30.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.8h, v4.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
"ldr q10, [x12, x11]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.8h, v8.8h, v9.8h\n"
+ "fmla v20.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v2.8h, v9.8h\n"
"ld1 { v9.8h }, [x15]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v18.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
"ldr q10, [x15, x17]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
"ldr q10, [x14, x4]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v25.8h, v6.8h, v10.8h\n"
"ldr q10, [x7, x17]\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x12, x4]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
"ldr q12, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v26.8h, v6.8h, v9.8h\n"
+ "fmla v20.8h, v4.8h, v9.8h\n"
+ "fmla v22.8h, v3.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
"ld1 { v10.8h }, [x14]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v19.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
"ldr q11, [x9, x17]\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
"ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
"ld1 { v10.8h }, [x12]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v18.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v12.8h\n"
"ldr q12, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
"ldr q10, [x26, x17]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v18.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
"ldr q11, [x9, x11]\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
"ldr q12, [x26, x11]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
"ldr q10, [x15, x4]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
"add x26, x26, #0x10\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v6.8h, v11.8h\n"
"ldr q11, [x15, x28]\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
"add x15, x15, #0x10\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v30.8h, v6.8h, v12.8h\n"
"ldr q12, [x9, x4]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v25.8h, v3.8h, v10.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
"ldr q10, [x9, x28]\n"
- "ldr q9, [x14, x17]\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
"add x9, x9, #0x10\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "ldr q13, [x16, #0x0]\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
"ldr q11, [x7, x25]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v3.8h, v12.8h\n"
"ldr q12, [x14, x11]\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q6, [x16, #0x70]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v8.8h, v10.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
"ld1 { v10.8h }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x8]\n"
- "ldr q7, [x16, #0x80]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x8, x5]\n"
- "ldr q8, [x16, #0x90]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x8, x23]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
"add x16, x16, #0xa0\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x8, x22]\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "st1 { v28.8h }, [x8]\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x10]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x10, x5]\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x10, x23]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x10, x22]\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "str q24, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.8h }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v20.8h }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.8h }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v21.8h }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x17]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x12, x11]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v21.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.8h, v7.8h, v24.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x9]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x15]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x15, x17]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "ldr q10, [x14, x4]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "ldr q10, [x7, x17]\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x12, x4]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr q12, [x7, x11]\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
+ "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v30.8h, v4.8h, v24.8h\n"
+ "fmla v18.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "fmla v14.8h, v0.8h, v24.8h\n"
+ "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v17.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v1.8h, v21.8h\n"
+ "ld1 { v20.8h }, [x9]\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v11.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v18.8h, v4.8h, v9.8h\n"
+ "fmla v10.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v14.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v20.8h\n"
+ "fmla v26.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.8h, v1.8h, v21.8h\n"
+ "fmla v23.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.8h, v4.8h, v21.8h\n"
+ "fmla v19.8h, v3.8h, v21.8h\n"
+ "fmla v31.8h, v0.8h, v21.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v5.8h, v20.8h\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.8h, v2.8h, v21.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v23.8h, v5.8h, v21.8h\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v11.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.8h, v7.8h, v20.8h\n"
+ "fmla v12.8h, v6.8h, v20.8h\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.8h, v4.8h, v21.8h\n"
+ "fmla v16.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v1.8h, v21.8h\n"
+ "fmla v30.8h, v0.8h, v21.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v31.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v22.8h\n"
+ "ldr q22, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x14]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x9, x17]\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "ldr q12, [x14, x25]\n"
+ "fmla v28.8h, v7.8h, v20.8h\n"
+ "fmla v16.8h, v6.8h, v20.8h\n"
+ "fmla v27.8h, v4.8h, v20.8h\n"
+ "fmla v30.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v1.8h, v20.8h\n"
+ "fmla v12.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.8h, v2.8h, v21.8h\n"
+ "fmla v17.8h, v1.8h, v21.8h\n"
+ "fmla v19.8h, v0.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x14]\n"
+ "fmla v14.8h, v2.8h, v20.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v0.8h, v21.8h\n"
+ "fmla v31.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v4.8h, v20.8h\n"
+ "fmla v25.8h, v1.8h, v20.8h\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v19.8h, v1.8h, v22.8h\n"
+ "ldr q20, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "ld1 { v10.8h }, [x12]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr q12, [x12, x25]\n"
+ "fmla v23.8h, v6.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x12]\n"
+ "fmla v12.8h, v4.8h, v24.8h\n"
+ "fmla v14.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v2.8h, v20.8h\n"
+ "ldr q20, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x26, x17]\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q11, [x9, x11]\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x26, x11]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x15, x4]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v21.8h\n"
+ "fmla v27.8h, v3.8h, v21.8h\n"
+ "fmla v26.8h, v0.8h, v21.8h\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v14.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v30.8h, v7.8h, v24.8h\n"
+ "fmla v18.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v5.8h, v24.8h\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.8h, v5.8h, v20.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v14.8h, v4.8h, v21.8h\n"
+ "fmla v25.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.8h, v8.8h, v22.8h\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v18.8h, v7.8h, v21.8h\n"
"add x26, x26, #0x10\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "ldr q11, [x15, x28]\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v10.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
"add x15, x15, #0x10\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x4]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x28]\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmla v14.8h, v7.8h, v20.8h\n"
+ "fmla v25.8h, v6.8h, v20.8h\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.8h, v4.8h, v9.8h\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v9.8h\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v19.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
"add x9, x9, #0x10\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x8]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmla v31.8h, v2.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmla v27.8h, v7.8h, v24.8h\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v26.8h, v4.8h, v24.8h\n"
+ "fmla v12.8h, v3.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v8.8h, v0.8h\n"
+ "fmla v10.8h, v7.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmla v14.8h, v5.8h, v0.8h\n"
+ "fmla v25.8h, v4.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "fmax v14.8h, v14.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "st1 { v23.8h }, [x8]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
"str q17, [x8, x5]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x8, x23]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x8, x22]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x10]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x10, x5]\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x10, x23]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x10, x22]\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "st1 { v28.8h }, [x10]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v12.8h, v12.8h, v15.8h\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.8h, v14.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q11, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.8h }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.8h }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v26.8h }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 141f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"add x23, x14, x17\n"
"add x22, x7, XZR\n"
@@ -699,27 +699,27 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr h11, [x21, #0x0]\n"
"ldr h12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
"add x20, x26, XZR\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -740,7 +740,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
"ldr h10, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
"add x20, x26, x25\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d11, [x20], #0x8\n"
@@ -762,7 +762,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
"add x20, x12, x17\n"
"tbz %x[n_channels], #2, 18f\n"
"ldr d9, [x20], #0x8\n"
@@ -792,8 +792,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -1513,40 +1513,40 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 138f\n"
"mov x23, x8\n"
"mov x22, x10\n"
@@ -1712,7 +1712,6 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"140:" // Tile loop: Oddments: Store: Bit 2: End
-
"141:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 16326150fd..96feeeeece 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -98,629 +98,629 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x3\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v14.8h }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "add x12, x12, #0x10\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "fmla v16.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.8h, v7.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v3.8h, v9.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.8h, v8.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v6.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v22.8h\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v18.8h, v1.8h, v22.8h\n"
+ "fmla v24.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.8h, v4.8h, v22.8h\n"
+ "fmla v15.8h, v3.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v27.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
+ "fmla v10.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v21.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x13]\n"
- "ldr q9, [x11, x8]\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "ldr q13, [x15, #0x0]\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v16.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.8h, v8.8h, v9.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
"fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q7, [x15, #0x80]\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v15.8h, v0.8h, v11.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v9.8h\n"
+ "fmla v15.8h, v1.8h, v9.8h\n"
+ "fmla v10.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v10.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v3.8h, v22.8h\n"
+ "fmla v15.8h, v5.8h, v12.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v10.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v23.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.8h, v8.8h, v22.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.8h, v7.8h, v22.8h\n"
+ "ldr q7, [x17, #0x80]\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x23, x12]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x22, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "str q18, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "str q19, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q10, [x10, x8]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x23, x12]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v26.8h, v3.8h, v11.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.8h, v4.8h, v22.8h\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
- "ldr q11, [x9, x8]\n"
- "ldr q12, [x28, x8]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "str q24, [x23, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "str q25, [x22, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "str q26, [x21, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x8, x8, #0x10\n"
- "cmp x8, x17, LSL #4\n"
- "str q27, [x20, x12]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "add x13, x13, #0x10\n"
- "str q28, [x23, x12]\n"
- "str q29, [x22, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.8h, v4.8h, v12.8h\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.8h, v6.8h, v9.8h\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.8h, v4.8h, v9.8h\n"
- "fmla v26.8h, v3.8h, v9.8h\n"
- "add x12, x12, #0x10\n"
- "fmla v20.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "fmla v16.8h, v1.8h, v12.8h\n"
- "fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.8h, v7.8h, v10.8h\n"
- "fmla v23.8h, v6.8h, v10.8h\n"
- "fmla v25.8h, v5.8h, v10.8h\n"
- "fmla v26.8h, v4.8h, v10.8h\n"
- "fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.8h, v4.8h, v10.8h\n"
- "fmla v18.8h, v3.8h, v10.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v5.8h, v10.8h\n"
- "fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v21.8h, v2.8h, v12.8h\n"
- "fmla v19.8h, v3.8h, v12.8h\n"
- "fmla v22.8h, v1.8h, v12.8h\n"
- "fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.8h, v7.8h, v11.8h\n"
- "fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v17.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v4.8h, v10.8h\n"
- "fmla v21.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v10.8h\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.8h, v2.8h, v10.8h\n"
- "fmla v17.8h, v1.8h, v10.8h\n"
- "fmla v18.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.8h, v7.8h, v11.8h\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.8h, v6.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v25.8h, v3.8h, v11.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.8h, v2.8h, v11.8h\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v5.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v11.8h\n"
- "fmla v27.8h, v5.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v7.8h, v10.8h\n"
- "fmla v30.8h, v6.8h, v10.8h\n"
- "fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.8h, v8.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v11.8h\n"
- "fmla v27.8h, v6.8h, v11.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x10, x13]\n"
- "fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.8h, v4.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmla v18.8h, v5.8h, v11.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x13]\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "fmla v22.8h, v2.8h, v11.8h\n"
- "fmla v23.8h, v1.8h, v11.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.8h, v6.8h, v24.8h\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v2.8h, v24.8h\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.8h, v7.8h, v22.8h\n"
+ "fmla v9.8h, v6.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v12.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v23.8h\n"
+ "fmla v15.8h, v1.8h, v23.8h\n"
+ "fmla v9.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.8h, v7.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v15.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v29.8h, v3.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.8h, v2.8h, v23.8h\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v23.8h\n"
+ "fmla v9.8h, v5.8h, v23.8h\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v5.8h, v23.8h\n"
+ "fmla v12.8h, v2.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.8h, v8.8h, v22.8h\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v12.8h, v3.8h, v16.8h\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.8h, v4.8h, v30.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v30.8h\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v24.8h\n"
+ "fmla v26.8h, v8.8h, v16.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmla v25.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v18.8h, v1.8h, v30.8h\n"
+ "fmla v31.8h, v0.8h, v30.8h\n"
+ "ldr q16, [x20, x15]\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x23, x12]\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "fmla v25.8h, v6.8h, v12.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "str q17, [x22, x12]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "str q18, [x21, x12]\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "str q19, [x20, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v15.8h, v2.8h, v24.8h\n"
+ "fmla v9.8h, v1.8h, v24.8h\n"
"fmin v20.8h, v20.8h, v14.8h\n"
- "fmla v30.8h, v5.8h, v10.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x23, x12]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "str q24, [x23, x12]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "str q25, [x22, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "str q26, [x21, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
- "str q27, [x20, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q28, [x23, x12]\n"
- "fmin v30.8h, v30.8h, v14.8h\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmax v9.8h, v9.8h, v13.8h\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.8h, v4.8h, v23.8h\n"
+ "fmla v26.8h, v3.8h, v23.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v4.8h, v16.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q29, [x22, x12]\n"
- "add x13, x13, #0x10\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v9.8h, v9.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v12.8h, v12.8h, v14.8h\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 140f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x23, [x14, #0x0]\n"
- "ldr x22, [x14, #0x8]\n"
- "add x23, x23, x13\n"
- "add x22, x22, x13\n"
- "ldr x21, [x14, #0x10]\n"
- "ldr x20, [x14, #0x18]\n"
- "add x21, x21, x13\n"
- "add x20, x20, x13\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x23], #0x8\n"
"ld1 { v10.d }[0], [x22], #0x8\n"
@@ -762,28 +762,28 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ld1 { v11.h }[0], [x21], #0x2\n"
"ld1 { v12.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
- "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
- "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
- "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
- "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 9f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
@@ -804,9 +804,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (5, 0): Bit 2: End
- "ldr x20, [x14, #0x28]\n"
- "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -827,9 +827,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (5, 5): Bit 2: End
- "ldr x20, [x14, #0x30]\n"
- "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -850,17 +850,17 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v21.8h, v7.8h, v9.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -881,10 +881,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -905,10 +905,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (0, 4): Bit 2: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x16, #0x48]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -929,10 +929,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
@@ -960,10 +960,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -984,10 +984,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (1, 5): Bit 2: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x16, #0x60]\n"
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -1008,10 +1008,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x16, #0x68]\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -1032,10 +1032,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (1, 2): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x16, #0x70]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
@@ -1060,10 +1060,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 5): Bit 2: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla v27.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -1084,10 +1084,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x80]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
@@ -1112,10 +1112,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (5, 1): Bit 2: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 61f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
@@ -1136,10 +1136,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x16, #0x90]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
@@ -1164,10 +1164,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (5, 4): Bit 2: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 69f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -1188,10 +1188,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x16, #0xa0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
@@ -1216,10 +1216,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x16, #0xa8]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 77f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1241,10 +1241,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x16, #0xb0]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
@@ -1269,10 +1269,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1294,10 +1294,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"87:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x16, #0xc0]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 89f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1319,10 +1319,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"91:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x20, [x14, #0xc8]\n"
+ "ldr x20, [x16, #0xc8]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
@@ -1347,10 +1347,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"95:" // Oddments: Load input (2, 5): Bit 2: End
- "ldr x20, [x14, #0xd0]\n"
+ "ldr x20, [x16, #0xd0]\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 97f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1372,10 +1372,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"99:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x20, [x14, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 101f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1397,10 +1397,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"103:" // Oddments: Load input (4, 2): Bit 2: End
- "ldr x20, [x14, #0xe0]\n"
+ "ldr x20, [x16, #0xe0]\n"
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
@@ -1425,10 +1425,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"107:" // Oddments: Load input (3, 5): Bit 2: End
- "ldr x20, [x14, #0xe8]\n"
+ "ldr x20, [x16, #0xe8]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 109f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1450,10 +1450,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 2): Bit 2: End
- "ldr x20, [x14, #0xf0]\n"
+ "ldr x20, [x16, #0xf0]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"tbz %x[n_channels], #2, 113f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1475,10 +1475,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"115:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
@@ -1503,10 +1503,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"119:" // Oddments: Load input (5, 3): Bit 2: End
- "ldr x20, [x14, #0x100]\n"
+ "ldr x20, [x16, #0x100]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.8h, v6.8h, v12.8h\n"
"tbz %x[n_channels], #2, 121f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1528,10 +1528,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"123:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x20, [x14, #0x108]\n"
+ "ldr x20, [x16, #0x108]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 125f\n"
@@ -1554,10 +1554,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"127:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x20, [x14, #0x110]\n"
+ "ldr x20, [x16, #0x110]\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 129f\n"
@@ -1580,10 +1580,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"131:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x20, [x14, #0x118]\n"
+ "ldr x20, [x16, #0x118]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 133f\n"
@@ -1608,24 +1608,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"135:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmax v30.8h, v30.8h, v15.8h\n"
- "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
"fmin v18.8h, v18.8h, v14.8h\n"
@@ -1643,150 +1643,150 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 137f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.d }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.d }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.d }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.d }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.d }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.d }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.d }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x8\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v28.d }[0], [x23]\n"
"st1 { v29.d }[0], [x22]\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 136f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x4\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v28.s }[2], [x23]\n"
"st1 { v29.s }[2], [x22]\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[6], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[6], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[6], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[6], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[6], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[6], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[6], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[6], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[6], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[6], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[6], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[6], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[6], [x23]\n"
"st1 { v29.h }[6], [x22]\n"
"st1 { v30.h }[6], [x21]\n"
@@ -1794,50 +1794,50 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"b 139f\n"
"136:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[4], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[4], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[4], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[4], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[4], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[4], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[4], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[4], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[4], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[4], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[4], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[4], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[4], [x23]\n"
"st1 { v29.h }[4], [x22]\n"
"st1 { v30.h }[4], [x21]\n"
@@ -1845,161 +1845,159 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"b 139f\n"
"137:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 138f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x4\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v28.s }[0], [x23]\n"
"st1 { v29.s }[0], [x22]\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[2], [x23]\n"
"st1 { v29.h }[2], [x22]\n"
"st1 { v30.h }[2], [x21]\n"
"st1 { v31.h }[2], [x20]\n"
"b 139f\n"
"138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.h }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.h }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.h }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.h }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.h }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.h }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.h }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.h }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.h }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.h }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.h }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.h }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.h }[0], [x23]\n"
"st1 { v29.h }[0], [x22]\n"
"st1 { v30.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"139:" // Oddments: Store: Bit 2: End
-
"140:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
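
In the rewritten loop above, the bias vector is loaded into v30, v13 and v14 hold the activation minimum and maximum, and every accumulator is clamped with an fmax/fmin pair before being stored. A minimal scalar sketch of that clamp, assuming an AArch64 toolchain where the __fp16 extension is available; the function name is illustrative and not part of the library:

// Scalar equivalent of the per-lane "fmax vN.8h, vN.8h, v13.8h" /
// "fmin vN.8h, vN.8h, v14.8h" pair applied to each accumulator.
static inline __fp16 clamp_activation(__fp16 acc, __fp16 act_min, __fp16 act_max)
{
    acc = (acc < act_min) ? act_min : acc;   // fmax against the broadcast minimum
    return (acc > act_max) ? act_max : acc;  // fmin against the broadcast maximum
}
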
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index d52f48064f..8ad6a37fea 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
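
The declarations above now name their parameters, which makes the calling convention explicit. A hypothetical call site for the indirect entry point, assuming a 5x5 input patch and a 2x2 output tile for this stride-2 3x3 kernel; the pointer wiring, channel count, and clamp bounds are illustrative and not taken from the library:

// Hypothetical invocation of the indirect entry point declared above.
const __fp16 *input_ptrs[25];  // 5x5 input patch: one pointer per (row, col) element
__fp16 *outptrs[4];            // 2x2 output tile: one pointer per output element
const void *params = nullptr;  // packed weights and bias, produced elsewhere
unsigned int n_channels = 64;

a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
    input_ptrs, outptrs, params, n_channels,
    (__fp16)0.0f,   // activation_min, e.g. a ReLU lower bound
    (__fp16)6.0f);  // activation_max, e.g. a ReLU6 upper bound
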
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 268dda531d..8954999990 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -118,9 +118,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x11, x13, x6\n"
"add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x10, x12, x24, LSL #1\n"
"add x9, x11, x6\n"
"add x28, x17, x21, LSL #1\n"
@@ -128,7 +128,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
@@ -150,179 +150,179 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
"add x23, x23, #0x10\n"
"add x8, x8, #0x10\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
"ld1 { v10.8h }, [x8]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "ld1 { v14.8h }, [x12]\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v20.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
"add x16, x16, #0x10\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "ld1 { v15.8h }, [x14]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q17, [x15, #0x0]\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x15, #0x0]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x12, x11]\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
"add x20, x20, #0x10\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
+ "fmla v23.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.8h, v0.8h, v25.8h\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
"add x21, x21, #0x10\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x14, x9]\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x14, x9]\n"
"ldr q4, [x15, #0x50]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ld1 { v15.8h }, [x10]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v22.8h, v2.8h, v20.8h\n"
"ldr q2, [x15, #0x30]\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v7.8h, v18.8h\n"
"ldr q16, [x12, x13]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
+ "fmla v23.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.8h, v3.8h, v16.8h\n"
"ldr q3, [x15, #0x40]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
"ldr q13, [x8, x9]\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
+ "fmla v22.8h, v7.8h, v19.8h\n"
"ld1 { v14.8h }, [x16]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v24.8h\n"
"ldr q12, [x8, x11]\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
"ldr q16, [x8, x13]\n"
"ldr q5, [x15, #0x60]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x10, x9]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "fmla v28.8h, v8.8h, v20.8h\n"
+ "ldr q17, [x10, x9]\n"
"ldr q6, [x15, #0x70]\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
"ldr q11, [x8, x6]\n"
"ldr q15, [x16, x6]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
"add x14, x14, #0x10\n"
"ldr q9, [x14, x13]\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
"add x12, x12, #0x10\n"
"add x10, x10, #0x10\n"
- "st1 { v28.8h }, [x17]\n"
+ "st1 { v29.8h }, [x17]\n"
"add x15, x15, #0xa0\n"
- "str q29, [x17, x7]\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v30.8h }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "st1 { v23.8h }, [x28]\n"
+ "str q22, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
"add x8, x8, #0x10\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "ld1 { v14.8h }, [x12]\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v19.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
"add x16, x16, #0x10\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "ld1 { v15.8h }, [x14]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x12, x11]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x14, x9]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ld1 { v15.8h }, [x10]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.8h, v0.8h, v25.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v22.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
"add x14, x14, #0x10\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v24.8h\n"
"ldr q16, [x12, x13]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
- "st1 { v28.8h }, [x17]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v22.8h, v6.8h, v17.8h\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.8h, v3.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v22.8h, v7.8h, v20.8h\n"
+ "fmla v21.8h, v7.8h, v18.8h\n"
+ "st1 { v29.8h }, [x17]\n"
"add x12, x12, #0x10\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x10, x9]\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
"add x10, x10, #0x10\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "str q29, [x17, x7]\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
- "st1 { v30.8h }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "st1 { v22.8h }, [x28]\n"
+ "str q21, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 81f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"add x27, x14, x13\n"
"add x26, x8, XZR\n"
@@ -409,17 +409,17 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h15, [x21, #0x0]\n"
"ldr h16, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"add x20, x16, x11\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 10f\n"
@@ -802,14 +802,14 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h11, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 78f\n"
"mov x21, x17\n"
"mov x20, x28\n"
@@ -871,7 +871,6 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"80:" // Tile loop: Oddments: Store: Bit 2: End
-
"81:" // Tile loop: End
"ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -886,7 +885,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 144d11fb39..6ae0b30afd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -88,258 +88,258 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "lsr x25, %x[n_channels], #0x3\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x3\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v26.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
- "sub x23, XZR, x26\n"
- "cbz x25, 3f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "cmp x26, x25, LSL #4\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
- "add x24, x24, #0xa0\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x28]\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
"ldr q10, [x20, x28]\n"
"ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
"ldr q12, [x20, x28]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x28]\n"
- "ldr q14, [x21, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
"ldp x21, x20, [x13, #0x30]\n"
"ldr q15, [x21, x28]\n"
"ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.8h, v2.8h, v13.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.8h, v3.8h, v14.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q17, [x24, #0x0]\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x20, x28]\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "ldr q0, [x24, #0x10]\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v4.8h, v15.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
"ldr q16, [x21, x28]\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v19.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.8h, v4.8h, v18.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "ldr q4, [x23, #0x50]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
- "ldr q11, [x20, x28]\n"
- "ldr q1, [x24, #0x20]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr x21, [x13, #0x90]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q15, [x21, x28]\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "ldr q13, [x22, x28]\n"
- "ldr q3, [x24, #0x40]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr x21, [x13, #0xb0]\n"
- "add x23, x23, #0x10\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "ldr q14, [x21, x28]\n"
+ "fmla v24.8h, v6.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v17.8h\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.8h, v3.8h, v17.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v20.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0xb8]\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "ldr q15, [x20, x28]\n"
- "ldr q7, [x24, #0x80]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q8, [x24, #0x90]\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x26]\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.8h, v19.8h, v26.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.8h, v19.8h, v27.8h\n"
"add x28, x28, #0x10\n"
- "ldr q10, [x20, x26]\n"
+ "ldr q10, [x20, x25]\n"
"ldp x21, x20, [x13, #0x10]\n"
- "str q28, [x12, x23]\n"
- "add x24, x24, #0xa0\n"
- "ldr q11, [x21, x26]\n"
- "ldr q12, [x20, x26]\n"
- "str q29, [x11, x23]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x26]\n"
- "str q30, [x10, x23]\n"
- "ldr q14, [x21, x26]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
"ldp x21, x20, [x13, #0x30]\n"
- "str q31, [x9, x23]\n"
- "ldr q15, [x21, x26]\n"
- "ldr q16, [x20, x26]\n"
- "add x26, x26, #0x10\n"
- "cmp x26, x25, LSL #4\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.8h, v2.8h, v13.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.8h, v3.8h, v14.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.8h, v4.8h, v15.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x20, x28]\n"
- "fmla v30.8h, v3.8h, v14.8h\n"
- "fmla v31.8h, v4.8h, v13.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.8h, v0.8h, v15.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.8h, v4.8h, v15.8h\n"
+ "fmla v24.8h, v4.8h, v18.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
"ldr q16, [x21, x28]\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v20.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v0.8h, v23.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v30.8h, v4.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
- "fmla v31.8h, v5.8h, v14.8h\n"
- "fmla v28.8h, v6.8h, v15.8h\n"
- "ldr x21, [x13, #0x90]\n"
- "ldr q15, [x21, x28]\n"
- "fmla v30.8h, v1.8h, v16.8h\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "ldr q13, [x22, x28]\n"
- "fmla v30.8h, v6.8h, v15.8h\n"
- "fmla v31.8h, v3.8h, v16.8h\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr q14, [x21, x28]\n"
- "fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v1.8h, v17.8h\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v2.8h, v19.8h\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v18.8h\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
"ldr x20, [x13, #0xb8]\n"
- "ldr q15, [x20, x28]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "fmla v30.8h, v5.8h, v16.8h\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x22, x28]\n"
- "fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "add x23, x23, #0x10\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.8h, v6.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
"add x28, x28, #0x10\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "str q28, [x12, x23]\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "str q29, [x11, x23]\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
- "str q30, [x10, x23]\n"
- "str q31, [x9, x23]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 80f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "mov x23, x28\n"
- "add x12, x12, x23\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "add x9, x9, x23\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
"ldr x27, [x13, #0x0]\n"
"ldr x26, [x13, #0x8]\n"
"add x27, x27, x28\n"
@@ -425,18 +425,18 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v15.h }[0], [x21], #0x2\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
- "mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"ldr x20, [x13, #0x40]\n"
"add x20, x20, x28\n"
- "mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 9f\n"
@@ -835,14 +835,14 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v11.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
- "fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
- "fmin v30.8h, v30.8h, v18.8h\n"
- "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 77f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -887,7 +887,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"80:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 81a608e349..1d1d491c28 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 8807f5d306..cecaf79704 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"add x13, x15, x2\n"
"add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x12, x14, x24, LSL #1\n"
"add x11, x13, x2\n"
"add x10, x5, x21, LSL #1\n"
@@ -130,7 +130,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x8, #0x20]\n"
@@ -150,366 +150,366 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1 { v14.8h }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
"add x23, x23, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q16, [x8, #0x140]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
"add x7, x7, #0x10\n"
- "fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
"ldr q1, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
"add x4, x4, #0x10\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x8, #0x20]\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q17, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
"add x21, x21, #0x10\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
+ "fmla v29.8h, v3.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.8h, v19.8h, v7.8h\n"
"ld1 { v7.8h }, [x7]\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x17, x11]\n"
- "fmla v29.8h, v1.8h, v13.8h\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v31.8h, v19.8h, v8.8h\n"
+ "fmla v29.8h, v19.8h, v14.8h\n"
+ "fmla v28.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.8h, v17.8h, v13.8h\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.8h, v17.8h, v23.8h\n"
"add x17, x17, #0x10\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ld1 { v5.8h }, [x16]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x16, x2]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v0.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.8h, v16.8h, v23.8h\n"
+ "ld1 { v24.8h }, [x16]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v0.8h\n"
+ "fmla v28.8h, v16.8h, v1.8h\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.8h, v20.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.8h, v20.8h, v1.8h\n"
+ "fmla v28.8h, v20.8h, v26.8h\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.8h, v19.8h, v6.8h\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v23.8h\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v23.8h\n"
+ "fmla v28.8h, v18.8h, v22.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ld1 { v9.8h }, [x14]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v17.8h, v22.8h\n"
+ "fmla v28.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.8h, v16.8h, v0.8h\n"
+ "ld1 { v0.8h }, [x14]\n"
+ "fmla v31.8h, v16.8h, v1.8h\n"
+ "fmla v29.8h, v16.8h, v20.8h\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v1.8h\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.8h, v11.8h, v24.8h\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.8h, v11.8h, v23.8h\n"
+ "fmla v29.8h, v11.8h, v0.8h\n"
+ "fmla v28.8h, v11.8h, v4.8h\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.8h, v18.8h, v22.8h\n"
+ "fmla v29.8h, v18.8h, v4.8h\n"
+ "fmla v28.8h, v18.8h, v6.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.8h, v17.8h, v20.8h\n"
"add x14, x14, #0x10\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v17.8h, v6.8h\n"
+ "fmla v28.8h, v17.8h, v26.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.8h, v16.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v31.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v26.8h\n"
+ "fmla v28.8h, v16.8h, v12.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.8h, v13.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.8h, v13.8h, v5.8h\n"
"ld1 { v14.8h }, [x17]\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x130]\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v13.8h, v12.8h\n"
+ "fmla v28.8h, v13.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.8h, v24.8h, v0.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.8h, v24.8h, v4.8h\n"
+ "fmla v29.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.8h, v24.8h, v17.8h\n"
"ldr q0, [x8, #0x150]\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v23.8h, v4.8h\n"
"ldr q13, [x7, x6]\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v23.8h, v6.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
"ldr q1, [x8, #0x160]\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v21.8h, v6.8h\n"
"ld1 { v5.8h }, [x4]\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
"ldr q2, [x8, #0x170]\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v20.8h, v26.8h\n"
"ldr q6, [x4, x2]\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v31.8h, v20.8h, v12.8h\n"
"add x12, x12, #0x10\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
"ldr q11, [x4, x15]\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
"ldr q3, [x8, #0x180]\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v12.8h\n"
"ldr q8, [x7, x2]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
"ldr q10, [x7, x11]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
"ldr q12, [x4, x13]\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
"ldr q9, [x4, x6]\n"
"ldr q4, [x8, #0x190]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"add x8, x8, #0x1a0\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x5]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x5, x3]\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
- "st1 { v30.8h }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "st1 { v29.8h }, [x10]\n"
+ "str q28, [x10, x3]\n"
"add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x8, #0x0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
"add x7, x7, #0x10\n"
"fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x8, #0x10]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "add x4, x4, #0x10\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x8, #0x20]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x17, x11]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "add x17, x17, #0x10\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q18, [x8, #0x10]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ld1 { v5.8h }, [x16]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x16, x2]\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ld1 { v25.8h }, [x16]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
"add x16, x16, #0x10\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ld1 { v9.8h }, [x14]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ld1 { v7.8h }, [x14]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
"add x14, x14, #0x10\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x8, #0x130]\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.8h, v2.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v5.8h, v2.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.8h, v14.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.8h, v14.8h, v16.8h\n"
+ "fmla v30.8h, v14.8h, v13.8h\n"
+ "fmla v29.8h, v14.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
"add x8, x8, #0x140\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"add x12, x12, #0x10\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x5]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x5, x3]\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "st1 { v31.8h }, [x5]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v30.8h }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "str q29, [x10, x3]\n"
"add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 117f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
@@ -609,11 +609,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h10, [x21, #0x0]\n"
"ldr h14, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
"add x20, x7, x15\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
"fmla v30.8h, v1.8h, v8.8h\n"
@@ -1294,14 +1294,14 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h9, [x20, #0x0]\n"
"112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 114f\n"
"mov x21, x5\n"
"mov x20, x10\n"
@@ -1363,7 +1363,6 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"116:" // Tile loop: Oddments: Store: Bit 2: End
-
"117:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -1378,7 +1377,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index a2791d277e..4913340c4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -103,16 +103,16 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x9, %x[n_channels], #0x3\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v27.8h }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x14, x13, [x21, #0x0]\n"
"ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
"sub x28, XZR, x17\n"
"cbz x9, 3f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -120,436 +120,436 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldr q5, [x27, x10]\n"
- "ldr q6, [x26, x10]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x10]\n"
- "ldr q8, [x24, x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q9, [x23, x10]\n"
- "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
"ldp x21, x20, [x15, #0x30]\n"
"ldr q11, [x21, x10]\n"
"ldr q12, [x20, x10]\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x10]\n"
- "ldr q14, [x26, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr q16, [x16, #0x140]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr q6, [x24, x10]\n"
- "fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x23, [x15, #0xa0]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x16, #0x80]\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q17, [x20, x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr x27, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q5, [x20, x10]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x16, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x23, x10]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.8h, v4.8h, v17.8h\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.8h, v23.8h, v7.8h\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.8h, v23.8h, v14.8h\n"
+ "fmla v29.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.8h, v21.8h, v8.8h\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.8h, v21.8h, v13.8h\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v13.8h\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.8h, v16.8h, v24.8h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.8h, v20.8h, v2.8h\n"
+ "fmla v29.8h, v20.8h, v8.8h\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.8h, v18.8h, v10.8h\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.8h, v18.8h, v8.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.8h, v1.8h, v14.8h\n"
+ "ldr q0, [x20, x10]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x130]\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v29.8h, v1.8h, v22.8h\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.8h, v17.8h, v5.8h\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.8h, v17.8h, v22.8h\n"
+ "fmla v29.8h, v17.8h, v21.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v16.8h, v19.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.8h, v23.8h, v1.8h\n"
+ "fmla v29.8h, v23.8h, v19.8h\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v8.8h\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.8h, v20.8h, v25.8h\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.8h, v20.8h, v19.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.8h, v6.8h, v22.8h\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v2.8h\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.8h, v18.8h, v21.8h\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.8h, v17.8h, v21.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v1.8h\n"
+ "fmla v28.8h, v17.8h, v5.8h\n"
+ "fmla v29.8h, v17.8h, v25.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.8h, v13.8h, v1.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.8h, v13.8h, v19.8h\n"
+ "fmla v28.8h, v13.8h, v25.8h\n"
+ "fmla v29.8h, v13.8h, v10.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.8h, v9.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.8h, v9.8h, v0.8h\n"
+ "fmla v28.8h, v9.8h, v10.8h\n"
+ "fmla v29.8h, v9.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.8h, v24.8h, v16.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v24.8h, v2.8h\n"
+ "fmla v28.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.8h, v24.8h, v17.8h\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "fmla v31.8h, v23.8h, v5.8h\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "ldr q5, [x27, x17]\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v21.8h, v5.8h\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "fmla v28.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "ldr q6, [x26, x17]\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x17]\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v20.8h, v25.8h\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "ldr q8, [x24, x17]\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q13, [x22, x17]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "ldr q9, [x23, x17]\n"
+ "fmla v30.8h, v19.8h, v10.8h\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x21, x17]\n"
"ldr q4, [x16, #0x190]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"ldr q11, [x21, x17]\n"
"ldr q12, [x20, x17]\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x17]\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "ldr q14, [x26, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "ldr q14, [x20, x17]\n"
"add x17, x17, #0x10\n"
"cmp x17, x9, LSL #4\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
"add x10, x10, #0x10\n"
- "str q28, [x14, x28]\n"
+ "str q30, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q29, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v9.8h\n"
- "ldr x23, [x15, #0x60]\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "ldr x21, [x15, #0x60]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
- "fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v28.8h, v2.8h, v9.8h\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v30.8h, v2.8h, v13.8h\n"
- "fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v30.8h, v3.8h, v5.8h\n"
- "fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.8h, v4.8h, v6.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x25, [x15, #0x90]\n"
- "fmla v28.8h, v0.8h, v7.8h\n"
- "fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x24, [x15, #0x98]\n"
- "ldr x23, [x15, #0xa0]\n"
- "fmla v30.8h, v0.8h, v14.8h\n"
- "fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v30.8h, v1.8h, v11.8h\n"
- "fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.8h, v2.8h, v5.8h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v28.8h, v3.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v3.8h, v6.8h\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v30.8h, v3.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.8h, v4.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
- "fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.8h, v0.8h, v14.8h\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.8h, v0.8h, v11.8h\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.8h, v0.8h, v5.8h\n"
- "fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.8h, v1.8h, v6.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x16, #0xb0]\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.8h, v2.8h, v10.8h\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
- "ldr x27, [x15, #0x100]\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
"ldr q3, [x16, #0xd0]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q8, [x23, x10]\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.8h, v0.8h, v5.8h\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.8h, v1.8h, v6.8h\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.8h, v1.8h, v10.8h\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.8h, v1.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v30.8h, v2.8h, v5.8h\n"
- "fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
- "fmla v30.8h, v3.8h, v6.8h\n"
- "fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.8h, v4.8h, v12.8h\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.8h, v4.8h, v14.8h\n"
- "fmla v30.8h, v4.8h, v8.8h\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x16, #0x130]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.8h, v3.8h, v20.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.8h, v3.8h, v19.8h\n"
+ "fmla v30.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.8h, v11.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.8h, v11.8h, v16.8h\n"
+ "fmla v30.8h, v11.8h, v13.8h\n"
+ "fmla v29.8h, v11.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v28.8h, v0.8h, v9.8h\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.8h, v0.8h, v13.8h\n"
- "fmla v30.8h, v0.8h, v11.8h\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.8h, v0.8h, v12.8h\n"
- "fmla v28.8h, v1.8h, v13.8h\n"
- "fmla v29.8h, v1.8h, v5.8h\n"
- "fmla v30.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.8h, v1.8h, v9.8h\n"
- "fmla v28.8h, v2.8h, v5.8h\n"
- "fmla v29.8h, v2.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
"add x10, x10, #0x10\n"
- "fmla v28.8h, v3.8h, v6.8h\n"
- "fmla v29.8h, v3.8h, v8.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v31.8h, v3.8h, v12.8h\n"
- "fmla v28.8h, v4.8h, v8.8h\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmla v30.8h, v4.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x14, x28]\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x13, x28]\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x13, x28]\n"
"str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q29, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 116f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x28, x10\n"
- "add x14, x14, x28\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x28\n"
+ "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
"add x9, x9, x10\n"
@@ -654,12 +654,12 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v10.h }[0], [x21], #0x2\n"
"ld1 { v14.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
- "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
- "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
"ldr x20, [x15, #0x50]\n"
"add x20, x20, x10\n"
- "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
- "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
"fmla v30.8h, v1.8h, v8.8h\n"
@@ -1365,14 +1365,14 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "fmax v29.8h, v29.8h, v18.8h\n"
- "fmax v30.8h, v30.8h, v18.8h\n"
- "fmax v31.8h, v31.8h, v18.8h\n"
- "fmin v28.8h, v28.8h, v17.8h\n"
- "fmin v29.8h, v29.8h, v17.8h\n"
- "fmin v30.8h, v30.8h, v17.8h\n"
- "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 113f\n"
"st1 { v28.d }[0], [x14], #0x8\n"
"st1 { v29.d }[0], [x13], #0x8\n"
@@ -1417,7 +1417,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"116:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
index 1ccd3408e2..b7608af721 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -42,7 +42,7 @@ class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKer
public:
a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}
- virtual KernelType get_kernel() const override { return kernel; }
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 418530fdc4..08f40b785f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -45,70 +45,70 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v2.8h }, [%x[minmax_vals]]\n"
- "lsr x12, %x[n_channels], #0x3\n"
+ "lsr x9, %x[n_channels], #0x3\n"
"add x20, %x[minmax_vals], #0x2\n"
"ld1r { v1.8h }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 5f\n"
+ "cbz x9, 5f\n"
"1:" // Channel loop
"movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr q14, [x10, x11]\n"
- "ldr q15, [x9, x11]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr q16, [x28, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x26, x11]\n"
- "ldr q19, [x25, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr q20, [x24, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
"fmla v23.8h, v14.8h, v0.8h\n"
- "ldr q14, [x10, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v24.8h, v15.8h, v0.8h\n"
"fmla v25.8h, v16.8h, v0.8h\n"
- "ldr q15, [x9, x11]\n"
- "ldr q16, [x28, x11]\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
"fmla v26.8h, v17.8h, v0.8h\n"
"fmla v27.8h, v18.8h, v0.8h\n"
- "ldr q17, [x27, x11]\n"
- "ldr q18, [x26, x11]\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
"fmla v28.8h, v19.8h, v0.8h\n"
"fmla v29.8h, v20.8h, v0.8h\n"
- "ldr q19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v30.8h, v21.8h, v0.8h\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x24, x11]\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.8h, v14.8h, v0.8h\n"
@@ -153,7 +153,7 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"str q30, [x21, x11]\n"
"str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x12, LSL #4\n"
+ "cmp x11, x9, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x7\n"
@@ -183,209 +183,209 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"9:" // Oddments: Load bias: Bit 2: End
"10:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 12f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #1, 11f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[6], [x10], #0x2\n"
- "ld1 { v15.h }[6], [x9], #0x2\n"
- "ld1 { v16.h }[6], [x28], #0x2\n"
- "ld1 { v17.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v19.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v21.h }[6], [x23], #0x2\n"
- "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"b 14f\n"
"11:" // Oddments: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[4], [x10], #0x2\n"
- "ld1 { v15.h }[4], [x9], #0x2\n"
- "ld1 { v16.h }[4], [x28], #0x2\n"
- "ld1 { v17.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v19.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v21.h }[4], [x23], #0x2\n"
- "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"b 14f\n"
"12:" // Oddments: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 13f\n"
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.h }[2], [x10], #0x2\n"
- "ld1 { v15.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v17.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v19.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v21.h }[2], [x23], #0x2\n"
- "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"b 14f\n"
"13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"14:" // Oddments: Load: Bit 2: End
"subs x20, %x[n_points], #0x1\n"
"ble 20f\n"
"15:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v23.8h, v14.8h, v0.8h\n"
"fmla v24.8h, v15.8h, v0.8h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"fmla v25.8h, v16.8h, v0.8h\n"
"fmla v26.8h, v17.8h, v0.8h\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr x21, [x10], #0x8\n"
"fmla v27.8h, v18.8h, v0.8h\n"
"fmla v28.8h, v19.8h, v0.8h\n"
- "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
"fmla v29.8h, v20.8h, v0.8h\n"
"fmla v30.8h, v21.8h, v0.8h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[6], [x10], #0x2\n"
- "ld1 { v15.h }[6], [x9], #0x2\n"
- "ld1 { v16.h }[6], [x28], #0x2\n"
- "ld1 { v17.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v19.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v21.h }[6], [x23], #0x2\n"
- "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"b 19f\n"
"16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[4], [x10], #0x2\n"
- "ld1 { v15.h }[4], [x9], #0x2\n"
- "ld1 { v16.h }[4], [x28], #0x2\n"
- "ld1 { v17.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v19.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v21.h }[4], [x23], #0x2\n"
- "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"b 19f\n"
"17:" // Oddments: Planar loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v14.h }[2], [x10], #0x2\n"
- "ld1 { v15.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v17.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v19.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v21.h }[2], [x23], #0x2\n"
- "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"b 19f\n"
"18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"19:" // Oddments: Planar loop: Load: Bit 2: End
"subs x20, x20, #0x1\n"
"bgt 15b\n"
@@ -507,12 +507,10 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.h }[0], [x21], #0x2\n"
"st1 { v31.h }[0], [x20], #0x2\n"
"24:" // Oddments: Store: Bit 2: End
-
"25:" // End
-
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 8fcbce2cfe..3646c18b04 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index f246cec87e..cee3fb59c5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -44,10 +44,10 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
const __fp16 minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v7.8h }, [%x[minmax_vals]]\n"
+ "ld1r { v8.8h }, [%x[minmax_vals]]\n"
"lsr x11, %x[n_output_channels], #0x3\n"
"add x20, %x[minmax_vals], #0x2\n"
- "ld1r { v6.8h }, [x20]\n"
+ "ld1r { v7.8h }, [x20]\n"
"mov x10, #0x0\n"
"cbz x11, 8f\n"
"1:" // Output channel loop
@@ -56,12 +56,12 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"lsl x20, x10, #0x1\n"
"ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "ldr q5, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
@@ -79,26 +79,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 6f\n"
- "ldr q2, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 6f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q0, [x9, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x24, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
@@ -107,332 +127,312 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "ldr q0, [x9, #0x0]\n"
- "ldr q2, [%x[weights], #0x10]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "lsl x28, x10, #0x1\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
"fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
"fmla v28.8h, v5.8h, v3.h[4]\n"
"fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
"lsl x28, x10, #0x1\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "add %x[weights], %x[weights], #0x10\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
+ "add %x[weights], %x[weights], #0x10\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
"fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
"fmla v28.8h, v5.8h, v3.h[4]\n"
"fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v16.8h, v1.8h, v2.h[0]\n"
+ "fmla v17.8h, v1.8h, v2.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v1.8h, v2.h[2]\n"
+ "fmla v19.8h, v1.8h, v2.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v1.8h, v2.h[4]\n"
+ "fmla v21.8h, v1.8h, v2.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v1.8h, v2.h[6]\n"
+ "fmla v23.8h, v1.8h, v2.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v1.8h, v0.h[0]\n"
+ "fmla v25.8h, v1.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v1.8h, v0.h[2]\n"
+ "fmla v27.8h, v1.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v1.8h, v0.h[4]\n"
+ "fmla v29.8h, v1.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v1.8h, v0.h[6]\n"
+ "fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmin v16.8h, v16.8h, v6.8h\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"lsl x28, x10, #0x1\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "str q16, [x20, x28]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q17, [x21, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "str q18, [x22, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q19, [x23, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q20, [x24, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q21, [x25, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q22, [x26, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "str q23, [x27, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "str q24, [x20, x28]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "str q25, [x21, x28]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "str q26, [x22, x28]\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x8\n"
"cmp x10, x11, LSL #3\n"
@@ -464,12 +464,12 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ld1 { v31.h }[0], [x20]\n"
"12:" // Output channel oddments: Load bias: Bit 2: End
"13:" // Output channel oddments: Load bias: Done
- "ldr q5, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q3, [x9, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
@@ -487,26 +487,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 17f\n"
- "ldr q2, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 17f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q0, [x9, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
"beq 15f\n"
"14:" // Output channel oddments: Kernel loop
- "ldp x24, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
@@ -515,32 +535,28 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "ldr q0, [x9, #0x0]\n"
- "ldr q2, [%x[weights], #0x10]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 14b\n"
"15:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
@@ -557,63 +573,31 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
"b 18f\n"
"16:" // Output channel oddments: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "ldp x24, x9, [x20], #0x10\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "ldr q4, [x24, #0x0]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
- "ldr q3, [x9, #0x0]\n"
- "ldr q5, [%x[weights], #0x0]\n"
- "fmla v16.8h, v2.8h, v1.h[0]\n"
- "fmla v17.8h, v2.8h, v1.h[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v18.8h, v2.8h, v1.h[2]\n"
- "fmla v19.8h, v2.8h, v1.h[3]\n"
- "fmla v20.8h, v2.8h, v1.h[4]\n"
- "fmla v21.8h, v2.8h, v1.h[5]\n"
- "fmla v22.8h, v2.8h, v1.h[6]\n"
- "fmla v23.8h, v2.8h, v1.h[7]\n"
- "fmla v24.8h, v2.8h, v0.h[0]\n"
- "fmla v25.8h, v2.8h, v0.h[1]\n"
- "fmla v26.8h, v2.8h, v0.h[2]\n"
- "fmla v27.8h, v2.8h, v0.h[3]\n"
- "fmla v28.8h, v2.8h, v0.h[4]\n"
- "fmla v29.8h, v2.8h, v0.h[5]\n"
- "fmla v30.8h, v2.8h, v0.h[6]\n"
- "fmla v31.8h, v2.8h, v0.h[7]\n"
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
@@ -628,415 +612,429 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.8h, v5.8h, v3.h[5]\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v0.8h, v2.h[0]\n"
+ "fmla v17.8h, v0.8h, v2.h[1]\n"
+ "fmla v18.8h, v0.8h, v2.h[2]\n"
+ "fmla v19.8h, v0.8h, v2.h[3]\n"
+ "fmla v20.8h, v0.8h, v2.h[4]\n"
+ "fmla v21.8h, v0.8h, v2.h[5]\n"
+ "fmla v22.8h, v0.8h, v2.h[6]\n"
+ "fmla v23.8h, v0.8h, v2.h[7]\n"
+ "fmla v24.8h, v0.8h, v1.h[0]\n"
+ "fmla v25.8h, v0.8h, v1.h[1]\n"
+ "fmla v26.8h, v0.8h, v1.h[2]\n"
+ "fmla v27.8h, v0.8h, v1.h[3]\n"
+ "fmla v28.8h, v0.8h, v1.h[4]\n"
+ "fmla v29.8h, v0.8h, v1.h[5]\n"
+ "fmla v30.8h, v0.8h, v1.h[6]\n"
+ "fmla v31.8h, v0.8h, v1.h[7]\n"
"b 18f\n"
"17:" // Output channel oddments: Single kernel point
- "fmla v16.8h, v5.8h, v4.h[0]\n"
- "fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmla v18.8h, v5.8h, v4.h[2]\n"
- "fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmla v20.8h, v5.8h, v4.h[4]\n"
- "fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmla v22.8h, v5.8h, v4.h[6]\n"
- "fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmla v24.8h, v5.8h, v3.h[0]\n"
- "fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmla v26.8h, v5.8h, v3.h[2]\n"
- "fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmla v28.8h, v5.8h, v3.h[4]\n"
- "fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmla v30.8h, v5.8h, v3.h[6]\n"
- "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
"18:" // Output channel oddments: Done
- "fmin v16.8h, v16.8h, v6.8h\n"
- "fmin v17.8h, v17.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmax v16.8h, v16.8h, v7.8h\n"
- "fmax v17.8h, v17.8h, v7.8h\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
"tbz %x[n_output_channels], #2, 20f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.d }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.d }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.d }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.d }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.d }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.d }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.d }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.d }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x4\n"
- "st1 { v24.d }[0], [x20]\n"
- "st1 { v25.d }[0], [x21]\n"
- "st1 { v26.d }[0], [x22]\n"
- "st1 { v27.d }[0], [x23]\n"
- "st1 { v28.d }[0], [x24]\n"
- "st1 { v29.d }[0], [x25]\n"
- "st1 { v30.d }[0], [x26]\n"
- "st1 { v31.d }[0], [x27]\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_output_channels], #1, 19f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.s }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.s }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.s }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.s }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.s }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.s }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.s }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.s }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x26]\n"
- "st1 { v31.s }[2], [x27]\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[6], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[6], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[6], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[6], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[6], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[6], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[6], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[6], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[6], [x20]\n"
- "st1 { v25.h }[6], [x21]\n"
- "st1 { v26.h }[6], [x22]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x24]\n"
- "st1 { v29.h }[6], [x25]\n"
- "st1 { v30.h }[6], [x26]\n"
- "st1 { v31.h }[6], [x27]\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x27]\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 22f\n"
"19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[4], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[4], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[4], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[4], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[4], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[4], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[4], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[4], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[4], [x20]\n"
- "st1 { v25.h }[4], [x21]\n"
- "st1 { v26.h }[4], [x22]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x24]\n"
- "st1 { v29.h }[4], [x25]\n"
- "st1 { v30.h }[4], [x26]\n"
- "st1 { v31.h }[4], [x27]\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x27]\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 22f\n"
"20:" // Output channel oddments: Done: Store: Bit 2: Unset
"tbz %x[n_output_channels], #1, 21f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.s }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.s }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.s }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.s }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.s }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.s }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.s }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.s }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v24.s }[0], [x20]\n"
- "st1 { v25.s }[0], [x21]\n"
- "st1 { v26.s }[0], [x22]\n"
- "st1 { v27.s }[0], [x23]\n"
- "st1 { v28.s }[0], [x24]\n"
- "st1 { v29.s }[0], [x25]\n"
- "st1 { v30.s }[0], [x26]\n"
- "st1 { v31.s }[0], [x27]\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[2], [x20]\n"
- "st1 { v25.h }[2], [x21]\n"
- "st1 { v26.h }[2], [x22]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x24]\n"
- "st1 { v29.h }[2], [x25]\n"
- "st1 { v30.h }[2], [x26]\n"
- "st1 { v31.h }[2], [x27]\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x27]\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 22f\n"
"21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x21, x21, x10, LSL #1\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #1\n"
- "add x23, x23, x10, LSL #1\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #1\n"
- "add x25, x25, x10, LSL #1\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #1\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #1\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #1\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #1\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #1\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #1\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #1\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"22:" // Output channel oddments: Done: Store: Bit 2: End
-
"23:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
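The fp16 kernel above reduces to a multiply-accumulate followed by an activation clamp: each fmla accumulates one broadcast input lane against a packed weight vector, and the trailing fmin/fmax pair bounds the result by the values loaded from minmax_vals (after this change the max bound sits in v7 and the min bound in v8, hence the added v8 clobber). A minimal scalar C++ sketch of one lane — illustrative names only, not library code:

    #include <algorithm>

    // One lane of the fmla / fmin / fmax sequence above; the kernel applies
    // this to eight fp16 lanes per vector register. All names are hypothetical.
    static inline float mla_clamp(float acc, float weight, float input,
                                  float act_min, float act_max)
    {
        acc += weight * input;             // fmla vN.8h, v6.8h, v1.h[i]
        acc = std::min(acc, act_max);      // fmin vN.8h, vN.8h, v7.8h
        return std::max(acc, act_min);     // fmax vN.8h, vN.8h, v8.8h
    }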
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 420e95384d..5d3db974f0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 2ff03aa15a..fd8686c15e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -110,15 +110,15 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"add x11, x15, x15\n"
"ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
"mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x9, x13, x25, LSL #2\n"
"mul x20, x20, x26\n" // offset *= output_tile_size
"add x28, x9, x25, LSL #2\n"
"add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x27, x28, x25, LSL #2\n"
"add x26, x11, x15\n"
"add x25, x12, x24, LSL #2\n"
@@ -126,7 +126,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x10, #0x20]\n"
@@ -145,162 +145,162 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
"add x23, x23, #0x10\n"
"cmp x23, x22, LSL #4\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x27]\n"
- "ldr q16, [x10, #0x0]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"add x13, x13, #0x10\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"ldr q4, [x10, #0x50]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ld1 { v9.4s }, [x28]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
"ldr q1, [x10, #0x20]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
"ldr q0, [x10, #0x10]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
"ldr q2, [x10, #0x30]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
"ldr q13, [x28, x15]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x15]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
"ldr q3, [x10, #0x40]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x11]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
"ldr q5, [x10, #0x60]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"ldr q11, [x13, x26]\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
"ldr q9, [x9, x15]\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
"ld1 { v10.4s }, [x13]\n"
"ldr q6, [x10, #0x70]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
"ldr q12, [x9, x11]\n"
"ldr q7, [x10, #0x80]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
"ldr q8, [x10, #0x90]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"add x27, x27, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x12]\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
"add x10, x10, #0xa0\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x12, x14]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.4s }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x27]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x26]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x13, x15]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x13, x11]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"add x13, x13, #0x10\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ld1 { v9.4s }, [x28]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x15]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x11]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"add x27, x27, #0x10\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x12]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x12, x14]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
- "st1 { v30.4s }, [x25]\n"
- "str q31, [x25, x14]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 31f\n"
- "ldr q16, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"ldr q0, [x10, #0x10]\n"
"add x24, x9, x15\n"
"add x23, x13, XZR\n"
@@ -335,11 +335,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s12, [x21, #0x0]\n"
"ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
"add x20, x27, XZR\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v12.4s\n"
@@ -470,14 +470,14 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
"fmla v30.4s, v8.4s, v12.4s\n"
"fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
"tbz %x[n_channels], #1, 29f\n"
"mov x21, x12\n"
"mov x20, x25\n"
@@ -503,7 +503,6 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"30:" // Tile loop: Oddments: Store: Bit 1: End
-
"31:" // Tile loop: End
"ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -518,11 +517,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
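
Note on the guard change above (standard C/C++ preprocessor behaviour, nothing library-specific): "#if __aarch64__" relies on the macro expanding to 1 when targeting AArch64 and on an undefined identifier evaluating to 0 elsewhere, which is legal but fires -Wundef on non-AArch64 builds; "#if defined(__aarch64__)" tests definedness explicitly. A minimal illustration:

    // Old form: an undefined __aarch64__ silently evaluates to 0,
    // but compilers warn under -Wundef.
    #if __aarch64__
    #endif

    // New form: explicitly asks whether the macro is defined at all,
    // which is warning-clean on every target.
    #if defined(__aarch64__)
    #endif // defined(__aarch64__)
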
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 56e9ed2e1b..7dedfd972a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -83,16 +83,16 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x15, %x[n_channels], #0x2\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
"sub x27, XZR, x16\n"
"cbz x15, 3f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x16, x15, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -104,197 +104,197 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "ldr q9, [x26, x28]\n"
- "ldr q10, [x22, x28]\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr q11, [x25, x28]\n"
- "ldr q12, [x24, x28]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x28]\n"
- "bge 2f\n"
- "1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q9, [x22, x28]\n"
- "ldr q16, [x14, #0x0]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
"ldr q12, [x20, x28]\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
- "ldr x24, [x13, #0x58]\n"
- "ldr x23, [x13, #0x60]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x25, x28]\n"
- "ldr x22, [x13, #0x68]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x28]\n"
- "ldr x21, [x13, #0x70]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"ldr q4, [x14, #0x50]\n"
- "ldr x20, [x13, #0x78]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q9, [x23, x28]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x21, x28]\n"
"ldr q1, [x14, #0x20]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x22, x28]\n"
- "ldp x26, x22, [x13, #0x0]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "ldp x25, x24, [x13, #0x10]\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr q13, [x23, x16]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x25, x28]\n"
"ldr q3, [x14, #0x40]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
- "ldr q11, [x25, x16]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x22, x16]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "ldr q9, [x26, x16]\n"
- "ldr q10, [x22, x16]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "ldr q12, [x24, x16]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x21, x16]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"add x16, x16, #0x10\n"
"add x27, x27, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
"cmp x16, x15, LSL #4\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
"add x28, x28, #0x10\n"
- "str q28, [x12, x27]\n"
+ "str q24, [x12, x27]\n"
"add x14, x14, #0xa0\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "ldr x22, [x13, #0x28]\n"
- "ldr x21, [x13, #0x30]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q9, [x22, x28]\n"
- "ldr x20, [x13, #0x38]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "ldr x22, [x13, #0x48]\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x26, [x13, #0x40]\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x25, [x13, #0x50]\n"
- "fmla v30.4s, v6.4s, v9.4s\n"
- "ldr q9, [x26, x28]\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr x24, [x13, #0x58]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v29.4s, v6.4s, v13.4s\n"
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
"ldr x23, [x13, #0x60]\n"
"ldr x22, [x13, #0x68]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x25, x28]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
"ldr x21, [x13, #0x70]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x28]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0x78]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
"add x27, x27, #0x10\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q9, [x23, x28]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr q10, [x22, x28]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x28]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "fmla v30.4s, v7.4s, v11.4s\n"
- "fmla v31.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
"add x28, x28, #0x10\n"
- "fmla v28.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v8.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x12, x27]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x11, x27]\n"
- "str q30, [x10, x27]\n"
- "str q31, [x9, x27]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 30f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q25, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "mov x27, x28\n"
- "add x12, x12, x27\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x9, x9, x27\n"
+ "add x9, x9, x20\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -329,12 +329,12 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v12.s }[0], [x21], #0x4\n"
"ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
"ldr x20, [x13, #0x28]\n"
"add x20, x20, x28\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v12.4s\n"
@@ -475,14 +475,14 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"27:" // Oddments: Load input (3, 2): Bit 1: End
"fmla v30.4s, v8.4s, v12.4s\n"
"fmla v31.4s, v7.4s, v12.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
"tbz %x[n_channels], #1, 28f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -503,11 +503,11 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"30:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
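
The clobber-list growth in the two hunks above follows directly from the register renumbering: these generated kernels name NEON registers literally in the asm template (v14-v31), so every newly used register must be declared clobbered. A minimal sketch of the same fmax/fmin activation clamp written with operand constraints instead of fixed register names, where the compiler allocates the registers and no vector clobbers are needed (this is an illustration, not the library's kernel):

    #include <arm_neon.h>

    static inline float32x4_t clamp_f32x4(float32x4_t x,
                                          float32x4_t lo, float32x4_t hi)
    {
        __asm__ __volatile__(
            "fmax %0.4s, %0.4s, %1.4s\n"  // apply the activation minimum
            "fmin %0.4s, %0.4s, %2.4s\n"  // apply the activation maximum
            : "+w" (x)                    // "+w": read-write NEON operand
            : "w" (lo), "w" (hi)
            : /* no clobbers: every register is named via an operand */);
        return x;
    }
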
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 0e9a3ba3fc..c2d86615e3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
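
The constructor change above replaces the literal argument list (3, 3, 1) with the class's own named constants forwarded through Parent, keeping the tile geometry in one place. A simplified sketch of the pattern (the base class here is a stand-in, not the real DepthwiseDepthfirstStrategy):

    struct StrategyBase  // illustrative stand-in for the parent strategy
    {
        StrategyBase(unsigned output_rows, unsigned output_cols,
                     unsigned kernel_rows, unsigned kernel_cols,
                     unsigned stride_rows, unsigned stride_cols) {}
    };

    struct Example3x3S1 : StrategyBase
    {
        using Parent = StrategyBase;
        constexpr static unsigned output_rows = 3, output_cols = 3;
        constexpr static unsigned kernel_rows = 3, kernel_cols = 3;
        constexpr static unsigned stride_rows = 1, stride_cols = 1;

        // Forwarding named constants where the old form passed bare
        // literals such as (3, 3, 1).
        Example3x3S1()
            : Parent(output_rows, output_cols, kernel_rows, kernel_cols,
                     stride_rows, stride_cols) {}
    };
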
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 620319bc7c..9bfcd9cd3c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -110,7 +110,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"lsr x23, %x[n_channels], #0x2\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x13, x16, x25, LSL #2\n"
"mul x20, x20, x26\n" // offset *= output_tile_size
"add x12, x13, x25, LSL #2\n"
@@ -120,9 +120,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"add x9, x11, x8\n"
"add x28, x15, x22, LSL #2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x27, x10, x25, LSL #2\n"
"add x26, x9, x8\n"
"add x25, x28, x22, LSL #2\n"
@@ -130,7 +130,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x24\n"
"cbz x23, 4f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"cmp x24, x23, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
@@ -149,304 +149,304 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"add x24, x24, #0x10\n"
"cmp x24, x23, LSL #4\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q16, [x14, #0x0]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ld1 { v12.4s }, [x10]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
"add x13, x13, #0x10\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
"add x10, x10, #0x10\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
"ld1 { v10.4s }, [x16]\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "ld1 { v12.4s }, [x12]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
"add x12, x12, #0x10\n"
"ldr q9, [x12, x11]\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
"ldr q2, [x14, #0x30]\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x27, x11]\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
"ldr q11, [x16, x26]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
"ldr q13, [x13, x11]\n"
"ldr q6, [x14, #0x70]\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
"add x27, x27, #0x10\n"
"ld1 { v12.4s }, [x27]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
"add x14, x14, #0xa0\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "st1 { v23.4s }, [x15]\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "str q24, [x15, x17]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q25, [x15, x22]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
"st1 { v26.4s }, [x28]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.4s }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x9]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x16, x8]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ld1 { v12.4s }, [x10]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x13, x26]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "ldr q10, [x10, x11]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x26]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x13, x8]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x27, x9]\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x13, x9]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
"add x13, x13, #0x10\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x9]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x10, x8]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
"add x10, x10, #0x10\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "ld1 { v12.4s }, [x12]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x26]\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
"add x12, x12, #0x10\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x27, x11]\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
"add x27, x27, #0x10\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "st1 { v23.4s }, [x15]\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "str q24, [x15, x17]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q25, [x15, x22]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
"add x15, x15, #0x10\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
"st1 { v26.4s }, [x28]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q27, [x28, x17]\n"
- "str q28, [x28, x22]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
"add x28, x28, #0x10\n"
- "st1 { v29.4s }, [x25]\n"
- "str q30, [x25, x17]\n"
- "str q31, [x25, x22]\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 49f\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
"add x24, x12, x11\n"
"add x23, x16, XZR\n"
@@ -481,18 +481,18 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr s12, [x21, #0x0]\n"
"ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
"add x20, x27, x26\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v6.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
@@ -741,25 +741,25 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
"fmla v29.4s, v8.4s, v13.4s\n"
"fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 47f\n"
"mov x22, x15\n"
"mov x21, x28\n"
@@ -804,7 +804,6 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"st1 { v28.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"48:" // Tile loop: Oddments: Store: Bit 1: End
-
"49:" // Tile loop: End
"ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -819,11 +818,11 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
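
Both hunks above load the activation bounds once with "ld1r { v15.4s }" / "ld1r { v14.4s }" and reuse them for every fmax/fmin pair in the tile. In intrinsics terms that is a scalar broadcast followed by a vector clamp; a minimal sketch (the Args struct is an illustrative subset of the real argument struct):

    #include <arm_neon.h>

    struct Args { float min, max; };  // illustrative subset only

    static inline float32x4_t activate(float32x4_t acc, const Args *args)
    {
        const float32x4_t vmin = vld1q_dup_f32(&args->min);  // ld1r broadcast
        const float32x4_t vmax = vld1q_dup_f32(&args->max);  // ld1r broadcast
        return vminq_f32(vmaxq_f32(acc, vmin), vmax);        // fmax, then fmin
    }
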
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 15053a337a..972f7eb535 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -87,405 +87,405 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x2\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr q13, [x27, x13]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q16, [x15, #0x0]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0x68]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "ldr x25, [x14, #0x78]\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x23, x14]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x23, x12]\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "ldr q5, [x15, #0x60]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "ldr q8, [x15, #0x90]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "ldr q13, [x27, x8]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x23, [x16, #0x20]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x8]\n"
- "ldr q10, [x10, x8]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x8]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "ldr q12, [x28, x8]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q24, [x22, x12]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "str q25, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "str q26, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x8, x8, #0x10\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "cmp x8, x17, LSL #4\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
- "str q28, [x22, x12]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x21, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "ldr x26, [x14, #0x30]\n"
- "ldr x25, [x14, #0x38]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr x24, [x14, #0x28]\n"
- "ldr x10, [x14, #0x48]\n"
- "ldr q10, [x10, x13]\n"
- "fmla v24.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "ldr x26, [x14, #0x70]\n"
- "ldr x10, [x14, #0x88]\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "add x12, x12, #0x10\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
"fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "fmla v28.4s, v0.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0x68]\n"
- "ldr x25, [x14, #0x78]\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v26.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
- "ldr x20, [x16, #0x18]\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "fmla v23.4s, v1.4s, v13.4s\n"
- "ldr q13, [x28, x13]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "ldr x27, [x14, #0xa0]\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v26.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v24.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x24, [x14, #0xa8]\n"
- "fmla v26.4s, v6.4s, v12.4s\n"
- "ldr q12, [x11, x13]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v25.4s, v5.4s, v13.4s\n"
- "ldr q13, [x25, x13]\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "ldr x26, [x14, #0xb0]\n"
- "ldr x25, [x14, #0xb8]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v27.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x9, x13]\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v24.4s, v5.4s, v11.4s\n"
- "fmla v25.4s, v4.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v8.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr q13, [x24, x13]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v12.4s\n"
- "fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "fmla v28.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v13.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x23, x12]\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr x23, [x16, #0x20]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q13, [x11, x13]\n"
- "fmla v26.4s, v3.4s, v12.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "str q24, [x22, x12]\n"
- "fmla v29.4s, v8.4s, v13.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x22, [x16, #0x28]\n"
- "fmla v31.4s, v6.4s, v13.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "str q25, [x21, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "str q26, [x20, x12]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q27, [x23, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "add x13, x13, #0x10\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x22, x12]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x21, x12]\n"
- "str q30, [x20, x12]\n"
- "str q31, [x23, x12]\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 48f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x24, [x14, #0x0]\n"
- "ldr x23, [x14, #0x8]\n"
- "add x24, x24, x13\n"
- "add x23, x23, x13\n"
- "ldr x22, [x14, #0x10]\n"
- "ldr x21, [x14, #0x18]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -506,19 +506,19 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ld1 { v12.s }[0], [x21], #0x4\n"
"ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "ldr x20, [x14, #0x28]\n"
- "add x20, x20, x13\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v6.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
@@ -534,9 +534,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"6:" // Oddments: Load input (4, 4): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (4, 4): Bit 1: End
- "ldr x20, [x14, #0x30]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla v31.4s, v8.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -545,10 +545,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (2, 1): Bit 1: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v23.4s, v7.4s, v11.4s\n"
"fmla v24.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v11.4s\n"
@@ -561,10 +561,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (0, 1): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 1): Bit 1: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x15, #0x40]\n"
"fmla v23.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -573,10 +573,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (0, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 3): Bit 1: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v24.4s, v2.4s, v12.4s\n"
"fmla v25.4s, v1.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -585,10 +585,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 3): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 3): Bit 1: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v24.4s, v8.4s, v10.4s\n"
"fmla v25.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.4s, v5.4s, v10.4s\n"
"fmla v28.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
@@ -601,10 +601,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (1, 0): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (1, 0): Bit 1: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v23.4s, v3.4s, v11.4s\n"
"fmla v26.4s, v0.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -613,10 +613,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 4): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v25.4s, v5.4s, v13.4s\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -625,10 +625,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (3, 0): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v26.4s, v6.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -637,10 +637,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (3, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (3, 2): Bit 1: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v28.4s, v6.4s, v10.4s\n"
"fmla v29.4s, v5.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v10.4s\n"
@@ -653,10 +653,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (3, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 4): Bit 1: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -665,10 +665,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (4, 1): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 1): Bit 1: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v29.4s, v7.4s, v13.4s\n"
"fmla v30.4s, v6.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -677,10 +677,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"28:" // Oddments: Load input (1, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 1): Bit 1: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v24.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 30f\n"
@@ -691,10 +691,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v24.4s, v5.4s, v11.4s\n"
"fmla v25.4s, v4.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v27.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -705,10 +705,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"32:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (4, 3): Bit 1: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.4s, v8.4s, v13.4s\n"
"fmla v31.4s, v7.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -717,10 +717,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -731,10 +731,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"36:" // Oddments: Load input (0, 2): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (0, 2): Bit 1: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
"fmla v24.4s, v1.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v25.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -744,10 +744,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v27.4s, v8.4s, v13.4s\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v30.4s, v5.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -758,10 +758,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"40:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
"fmla v26.4s, v3.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v29.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 42f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -771,10 +771,10 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (2, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (2, 4): Bit 1: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x14\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
@@ -786,120 +786,120 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"45:" // Oddments: Load input (4, 2): Bit 1: End
"fmla v29.4s, v8.4s, v13.4s\n"
"fmla v30.4s, v7.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 46f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.d }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.d }[0], [x22]\n"
- "st1 { v25.d }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "add x12, x12, #0x8\n"
- "st1 { v28.d }[0], [x22]\n"
- "st1 { v29.d }[0], [x21]\n"
- "st1 { v30.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 47f\n"
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[2], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Store: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "add x23, x23, x12\n"
- "st1 { v23.s }[0], [x23]\n"
- "ldr x22, [x16, #0x8]\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x22, x22, x12\n"
- "add x21, x21, x12\n"
- "ldr x23, [x16, #0x20]\n"
- "add x20, x20, x12\n"
- "add x23, x23, x12\n"
- "st1 { v24.s }[0], [x22]\n"
- "st1 { v25.s }[0], [x21]\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x22, x22, x12\n"
- "st1 { v26.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
- "st1 { v27.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x23]\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"47:" // Oddments: Store: Bit 1: End
"48:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
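The fmax/fmin pairs that close each channel loop in this kernel apply the fused activation clamp: every accumulator lane is bounded below by the broadcast activation_min and above by activation_max, loaded once via ld1r from the args struct. A minimal scalar sketch of the per-lane computation, using only the activation_min/activation_max parameters from the kernel signature; this is illustrative C++, not library code:

#include <algorithm>

// Scalar equivalent of the vector epilogue above:
//   fmax vN.4s, vN.4s, v_min.4s   // lower bound (broadcast activation_min)
//   fmin vN.4s, vN.4s, v_max.4s   // upper bound (broadcast activation_max)
static inline float clamp_activation(float acc, float activation_min, float activation_max)
{
    return std::min(std::max(acc, activation_min), activation_max);
}
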
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 6c897d6eaa..8a198c1818 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
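The constructor change in the hunk above replaces a hard-coded argument list with the strategy's named constants forwarded to the parent. A reduced sketch of the pattern, with a hypothetical ParentStrategy type standing in for DepthwiseDepthfirstStrategy; the constants match this 3x3, stride-1, 4x4-output kernel:

struct ParentStrategy
{
    const unsigned m_output_rows, m_output_cols;
    const unsigned m_kernel_rows, m_kernel_cols;
    const unsigned m_stride_rows, m_stride_cols;

    ParentStrategy(unsigned orows, unsigned ocols,
                   unsigned krows, unsigned kcols,
                   unsigned srows, unsigned scols)
    : m_output_rows(orows), m_output_cols(ocols),
      m_kernel_rows(krows), m_kernel_cols(kcols),
      m_stride_rows(srows), m_stride_cols(scols)
    {
    }
};

struct example_4x4_strategy : ParentStrategy
{
    constexpr static unsigned output_rows = 4, output_cols = 4;
    constexpr static unsigned kernel_rows = 3, kernel_cols = 3;
    constexpr static unsigned stride_rows = 1, stride_cols = 1;

    // Forward the named constants instead of repeating magic numbers such as (4, 3, 1).
    example_4x4_strategy()
    : ParentStrategy(output_rows, output_cols, kernel_rows, kernel_cols,
                     stride_rows, stride_cols)
    {
    }
};

Passing the constants through once keeps the header and the parent class in agreement if a tile or kernel size ever changes.
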
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 6d2b6ee998..3adf8b0d9f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -124,9 +124,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x22, LSL #2\n"
"add x23, x5, x5\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v13.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x26, x9, x24, LSL #2\n"
"add x25, x28, x4\n"
"add x24, x27, x22, LSL #2\n"
@@ -134,7 +134,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x6\n"
"cbz x13, 4f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x6, x13, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -152,499 +152,499 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"add x6, x6, #0x10\n"
"cmp x6, x13, LSL #4\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"add x20, x20, #0x10\n"
"add x21, x21, #0x10\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
"ldr q9, [x12, x17]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v30.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.4s, v4.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
"ldr q10, [x12, x11]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.4s, v8.4s, v9.4s\n"
+ "fmla v20.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v2.4s, v9.4s\n"
"ld1 { v9.4s }, [x15]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v18.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
"ldr q10, [x15, x17]\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
"ldr q10, [x14, x4]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v25.4s, v6.4s, v10.4s\n"
"ldr q10, [x7, x17]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x12, x4]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
"ldr q12, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v26.4s, v6.4s, v9.4s\n"
+ "fmla v20.4s, v4.4s, v9.4s\n"
+ "fmla v22.4s, v3.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x14]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v19.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
"ldr q11, [x9, x17]\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
"ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
"ld1 { v10.4s }, [x12]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v18.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v12.4s\n"
"ldr q12, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
"ldr q10, [x26, x17]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v18.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
"ldr q11, [x9, x11]\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
"ldr q12, [x26, x11]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
"ldr q10, [x15, x4]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
"add x26, x26, #0x10\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v6.4s, v11.4s\n"
"ldr q11, [x15, x28]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
"add x15, x15, #0x10\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v6.4s, v12.4s\n"
"ldr q12, [x9, x4]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v25.4s, v3.4s, v10.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
"ldr q10, [x9, x28]\n"
- "ldr q9, [x14, x17]\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
"add x9, x9, #0x10\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "ldr q13, [x16, #0x0]\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
"ldr q11, [x7, x25]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v3.4s, v12.4s\n"
"ldr q12, [x14, x11]\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr q6, [x16, #0x70]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v8.4s, v10.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
"ld1 { v10.4s }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "st1 { v16.4s }, [x8]\n"
- "ldr q7, [x16, #0x80]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q17, [x8, x5]\n"
- "ldr q8, [x16, #0x90]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q18, [x8, x23]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
"add x16, x16, #0xa0\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q19, [x8, x22]\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "st1 { v28.4s }, [x8]\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "st1 { v20.4s }, [x10]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q21, [x10, x5]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q22, [x10, x23]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q23, [x10, x22]\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.4s }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v20.4s }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.4s }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v21.4s }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x17]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x26]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x25]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x12, x11]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v21.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.4s, v7.4s, v24.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x7, x4]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x7, x28]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x25]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x25]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x11]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x15]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x15, x17]\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "ldr q11, [x26, x4]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "ldr q10, [x14, x4]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x26, x28]\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x7, x17]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x12, x4]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr q12, [x7, x11]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
+ "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v30.4s, v4.4s, v24.4s\n"
+ "fmla v18.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "fmla v14.4s, v0.4s, v24.4s\n"
+ "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v17.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v1.4s, v21.4s\n"
+ "ld1 { v20.4s }, [x9]\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v11.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v18.4s, v4.4s, v9.4s\n"
+ "fmla v10.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v14.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v20.4s\n"
+ "fmla v26.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.4s, v1.4s, v21.4s\n"
+ "fmla v23.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.4s, v4.4s, v21.4s\n"
+ "fmla v19.4s, v3.4s, v21.4s\n"
+ "fmla v31.4s, v0.4s, v21.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v5.4s, v20.4s\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.4s, v2.4s, v21.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v23.4s, v5.4s, v21.4s\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v11.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.4s, v7.4s, v20.4s\n"
+ "fmla v12.4s, v6.4s, v20.4s\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.4s, v4.4s, v21.4s\n"
+ "fmla v16.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v1.4s, v21.4s\n"
+ "fmla v30.4s, v0.4s, v21.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v31.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v22.4s\n"
+ "ldr q22, [x7, x11]\n"
"add x7, x7, #0x10\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x28]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x14]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x9, x17]\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "ldr q12, [x14, x25]\n"
+ "fmla v28.4s, v7.4s, v20.4s\n"
+ "fmla v16.4s, v6.4s, v20.4s\n"
+ "fmla v27.4s, v4.4s, v20.4s\n"
+ "fmla v30.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v1.4s, v20.4s\n"
+ "fmla v12.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.4s, v2.4s, v21.4s\n"
+ "fmla v17.4s, v1.4s, v21.4s\n"
+ "fmla v19.4s, v0.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x14]\n"
+ "fmla v14.4s, v2.4s, v20.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v0.4s, v21.4s\n"
+ "fmla v31.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v4.4s, v20.4s\n"
+ "fmla v25.4s, v1.4s, v20.4s\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v19.4s, v1.4s, v22.4s\n"
+ "ldr q20, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "ld1 { v10.4s }, [x12]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr q12, [x12, x25]\n"
+ "fmla v23.4s, v6.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x12]\n"
+ "fmla v12.4s, v4.4s, v24.4s\n"
+ "fmla v14.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v2.4s, v20.4s\n"
+ "ldr q20, [x12, x25]\n"
"add x12, x12, #0x10\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x17]\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "ldr q11, [x9, x11]\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x26, x11]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x15, x4]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v21.4s\n"
+ "fmla v27.4s, v3.4s, v21.4s\n"
+ "fmla v26.4s, v0.4s, v21.4s\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v14.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v30.4s, v7.4s, v24.4s\n"
+ "fmla v18.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v5.4s, v24.4s\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.4s, v5.4s, v20.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v14.4s, v4.4s, v21.4s\n"
+ "fmla v25.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.4s, v8.4s, v22.4s\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v18.4s, v7.4s, v21.4s\n"
"add x26, x26, #0x10\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "ldr q11, [x15, x28]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v10.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
"add x15, x15, #0x10\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x4]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x28]\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmla v14.4s, v7.4s, v20.4s\n"
+ "fmla v25.4s, v6.4s, v20.4s\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.4s, v4.4s, v9.4s\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v9.4s\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v19.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
"add x9, x9, #0x10\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "st1 { v16.4s }, [x8]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmla v31.4s, v2.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmla v27.4s, v7.4s, v24.4s\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v26.4s, v4.4s, v24.4s\n"
+ "fmla v12.4s, v3.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v8.4s, v0.4s\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmla v14.4s, v5.4s, v0.4s\n"
+ "fmla v25.4s, v4.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "st1 { v23.4s }, [x8]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
"str q17, [x8, x5]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q18, [x8, x23]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q19, [x8, x22]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
"add x8, x8, #0x10\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "st1 { v20.4s }, [x10]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q21, [x10, x5]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q22, [x10, x23]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q23, [x10, x22]\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "st1 { v28.4s }, [x10]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v12.4s, v12.4s, v15.4s\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.4s, v14.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q11, [x10, x22]\n"
"add x10, x10, #0x10\n"
- "st1 { v24.4s }, [x27]\n"
- "str q25, [x27, x5]\n"
- "str q26, [x27, x23]\n"
- "str q27, [x27, x22]\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
"add x27, x27, #0x10\n"
- "st1 { v28.4s }, [x24]\n"
- "str q29, [x24, x5]\n"
- "str q30, [x24, x23]\n"
- "str q31, [x24, x22]\n"
+ "st1 { v26.4s }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 73f\n"
- "ldr q13, [x16, #0x0]\n"
+ "ldr q14, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"add x23, x14, x17\n"
"add x22, x7, XZR\n"
@@ -675,27 +675,27 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr s11, [x21, #0x0]\n"
"ldr s12, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
"add x20, x26, XZR\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
"fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
"fmla v26.4s, v1.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -704,7 +704,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
"ldr s10, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
"add x20, x26, x25\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d11, [x20], #0x8\n"
@@ -714,7 +714,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
"add x20, x12, x17\n"
"tbz %x[n_channels], #1, 11f\n"
"ldr d9, [x20], #0x8\n"
@@ -732,8 +732,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -1105,40 +1105,40 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 71f\n"
"mov x23, x8\n"
"mov x22, x10\n"
@@ -1229,4 +1229,4 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
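
The guard change above, repeated across these kernel files, replaces the bare #if __aarch64__ test with an explicit #if defined(__aarch64__). The two forms accept the same builds, since compilers predefine __aarch64__ to 1 on AArch64 targets and #if treats an undefined identifier as 0, but the defined() form also compiles cleanly under -Wundef. A minimal sketch of the resulting file shape, with a hypothetical kernel body and names that are illustrative only and not taken from this patch:

    #include <cstddef>

    #if defined(__aarch64__)
    namespace arm_conv {
    namespace depthwise {

    // Hypothetical stand-in for a generated kernel body; only the
    // preprocessor guard shape mirrors the change in this patch.
    void example_kernel_impl(const float *src, float *dst, size_t n)
    {
      for (size_t i = 0; i < n; i++) dst[i] = src[i];
    }

    } // namespace depthwise
    } // namespace arm_conv
    #endif // defined(__aarch64__)
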
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 2353045021..76045f30d6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -98,629 +98,629 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x8, #0x10\n" // cntb _, ALL, #1
- "lsr x17, %x[n_channels], #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v13.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v14.4s }, [x20]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x13, #0x0\n"
- "sub x12, XZR, x8\n"
- "cbz x17, 3f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x8, x17, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "ldr q9, [x11, x13]\n"
- "ldr q10, [x10, x13]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "ldr q11, [x9, x13]\n"
- "ldr q12, [x28, x13]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "add x12, x12, #0x10\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v16.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.4s, v7.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v3.4s, v9.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.4s, v8.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v6.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v22.4s\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v18.4s, v1.4s, v22.4s\n"
+ "fmla v24.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.4s, v4.4s, v22.4s\n"
+ "fmla v15.4s, v3.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v8.4s, v9.4s\n"
"fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v27.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
"fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v10.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v21.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
"fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "ldp x11, x10, [x14, #0x0]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x13]\n"
- "ldr q9, [x11, x8]\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "ldr q13, [x15, #0x0]\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v16.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.4s, v8.4s, v9.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
"fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v15.4s, v0.4s, v11.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v9.4s\n"
+ "fmla v15.4s, v1.4s, v9.4s\n"
+ "fmla v10.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v10.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v3.4s, v22.4s\n"
+ "fmla v15.4s, v5.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v10.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v23.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.4s, v8.4s, v22.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.4s, v7.4s, v22.4s\n"
+ "ldr q7, [x17, #0x80]\n"
"fmin v17.4s, v17.4s, v14.4s\n"
- "str q16, [x23, x12]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q17, [x22, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "str q18, [x21, x12]\n"
- "ldr x22, [x16, #0x28]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q19, [x20, x12]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q10, [x10, x8]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q20, [x23, x12]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v26.4s, v3.4s, v11.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.4s, v4.4s, v22.4s\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
"fmin v23.4s, v23.4s, v14.4s\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "ldp x9, x28, [x14, #0x10]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
"fmin v25.4s, v25.4s, v14.4s\n"
- "ldr q11, [x9, x8]\n"
- "ldr q12, [x28, x8]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "str q24, [x23, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "str q25, [x22, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "str q26, [x21, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x8, x8, #0x10\n"
- "cmp x8, x17, LSL #4\n"
- "str q27, [x20, x12]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
"fmin v29.4s, v29.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "add x13, x13, #0x10\n"
- "str q28, [x23, x12]\n"
- "str q29, [x22, x12]\n"
- "add x15, x15, #0xa0\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "ldr x27, [x14, #0x20]\n"
- "ldr x26, [x14, #0x30]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "ldr x25, [x14, #0x28]\n"
- "ldr x24, [x14, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x11, [x14, #0x40]\n"
- "ldr x10, [x14, #0x48]\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v12.4s\n"
- "ldr x9, [x14, #0x50]\n"
- "ldr x28, [x14, #0x58]\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
- "ldr q9, [x26, x13]\n"
- "ldr x26, [x14, #0x70]\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v22.4s, v4.4s, v12.4s\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "ldr x27, [x14, #0x60]\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla v26.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0x88]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0x78]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v22.4s, v6.4s, v9.4s\n"
- "ldr x11, [x14, #0x80]\n"
- "fmla v25.4s, v4.4s, v9.4s\n"
- "fmla v26.4s, v3.4s, v9.4s\n"
- "add x12, x12, #0x10\n"
- "fmla v20.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "fmla v16.4s, v1.4s, v12.4s\n"
- "fmla v17.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "ldr x28, [x14, #0x98]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "ldr x27, [x14, #0xa0]\n"
- "fmla v22.4s, v7.4s, v10.4s\n"
- "fmla v23.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "ldr x26, [x14, #0xb0]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "ldr x24, [x14, #0xb8]\n"
- "fmla v27.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v11.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
- "ldr q9, [x9, x13]\n"
- "ldr x9, [x14, #0x90]\n"
- "fmla v29.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla v16.4s, v3.4s, v9.4s\n"
- "fmla v20.4s, v0.4s, v9.4s\n"
- "ldr q11, [x11, x13]\n"
- "ldr x11, [x14, #0xc0]\n"
- "fmla v17.4s, v4.4s, v10.4s\n"
- "fmla v18.4s, v3.4s, v10.4s\n"
- "fmla v21.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v5.4s, v10.4s\n"
- "fmla v20.4s, v2.4s, v10.4s\n"
- "ldr q10, [x10, x13]\n"
- "ldr x10, [x14, #0xc8]\n"
- "fmla v17.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v19.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v1.4s, v12.4s\n"
- "fmla v23.4s, v0.4s, v12.4s\n"
- "ldr q12, [x28, x13]\n"
- "ldr x28, [x14, #0xd8]\n"
- "fmla v28.4s, v7.4s, v11.4s\n"
- "fmla v29.4s, v6.4s, v11.4s\n"
- "ldr q11, [x9, x13]\n"
- "ldr x9, [x14, #0xd0]\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v10.4s\n"
- "fmla v21.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x13]\n"
- "ldr x27, [x14, #0xe0]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x13]\n"
- "fmla v27.4s, v1.4s, v12.4s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v5.4s, v12.4s\n"
- "fmla v23.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x13]\n"
- "ldr x26, [x14, #0xf0]\n"
- "fmla v16.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v18.4s, v0.4s, v10.4s\n"
- "ldr q10, [x24, x13]\n"
- "fmla v20.4s, v7.4s, v11.4s\n"
- "ldr x24, [x14, #0xf8]\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x13]\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "ldr x11, [x14, #0x100]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x13]\n"
- "ldr x10, [x14, #0x108]\n"
- "fmla v16.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x13]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "ldr x9, [x14, #0x110]\n"
- "fmla v23.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v27.4s, v2.4s, v12.4s\n"
- "ldr x28, [x14, #0x118]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v20.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v3.4s, v10.4s\n"
- "ldr q10, [x25, x13]\n"
- "fmla v25.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v11.4s\n"
- "fmla v27.4s, v5.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v24.4s, v8.4s, v11.4s\n"
- "ldr q11, [x26, x13]\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr q10, [x11, x13]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v11.4s\n"
- "fmla v27.4s, v6.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x10, x13]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x24, x13]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x13]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x13]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "fmla v22.4s, v2.4s, v11.4s\n"
- "fmla v23.4s, v1.4s, v11.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.4s, v6.4s, v24.4s\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v2.4s, v24.4s\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.4s, v7.4s, v22.4s\n"
+ "fmla v9.4s, v6.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v12.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v23.4s\n"
+ "fmla v15.4s, v1.4s, v23.4s\n"
+ "fmla v9.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.4s, v7.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v15.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v29.4s, v3.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.4s, v2.4s, v23.4s\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v23.4s\n"
+ "fmla v9.4s, v5.4s, v23.4s\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v5.4s, v23.4s\n"
+ "fmla v12.4s, v2.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.4s, v8.4s, v22.4s\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v12.4s, v3.4s, v16.4s\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.4s, v4.4s, v30.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v30.4s\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v24.4s\n"
+ "fmla v26.4s, v8.4s, v16.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmla v25.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v18.4s, v1.4s, v30.4s\n"
+ "fmla v31.4s, v0.4s, v30.4s\n"
+ "ldr q16, [x20, x15]\n"
"fmin v17.4s, v17.4s, v14.4s\n"
- "str q16, [x23, x12]\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "str q17, [x22, x12]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q18, [x21, x12]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "str q19, [x20, x12]\n"
- "ldr x23, [x16, #0x20]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "ldr x22, [x16, #0x28]\n"
- "ldr x21, [x16, #0x30]\n"
- "ldr x20, [x16, #0x38]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v15.4s, v2.4s, v24.4s\n"
+ "fmla v9.4s, v1.4s, v24.4s\n"
"fmin v20.4s, v20.4s, v14.4s\n"
- "fmla v30.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "str q20, [x23, x12]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q21, [x22, x12]\n"
- "ldr x23, [x16, #0x40]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "ldr x22, [x16, #0x48]\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q23, [x20, x12]\n"
- "ldr x21, [x16, #0x50]\n"
- "ldr x20, [x16, #0x58]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q24, [x23, x12]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q25, [x22, x12]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "str q26, [x21, x12]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "str q27, [x20, x12]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x78]\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q28, [x23, x12]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v13.4s\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.4s, v4.4s, v23.4s\n"
+ "fmla v26.4s, v3.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v4.4s, v16.4s\n"
"fmin v31.4s, v31.4s, v14.4s\n"
- "str q29, [x22, x12]\n"
- "add x13, x13, #0x10\n"
- "str q30, [x21, x12]\n"
- "str q31, [x20, x12]\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v9.4s, v9.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v12.4s, v12.4s, v14.4s\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 72f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x12, x13\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x23, [x14, #0x0]\n"
- "ldr x22, [x14, #0x8]\n"
- "add x23, x23, x13\n"
- "add x22, x22, x13\n"
- "ldr x21, [x14, #0x10]\n"
- "ldr x20, [x14, #0x18]\n"
- "add x21, x21, x13\n"
- "add x20, x20, x13\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x23], #0x8\n"
"ld1 { v10.d }[0], [x22], #0x8\n"
@@ -738,28 +738,28 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ld1 { v11.s }[0], [x21], #0x4\n"
"ld1 { v12.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "ldr x20, [x14, #0x20]\n"
- "add x20, x20, x13\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
"fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
"fmla v26.4s, v1.4s, v12.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 6f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
@@ -768,9 +768,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"6:" // Oddments: Load input (5, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (5, 0): Bit 1: End
- "ldr x20, [x14, #0x28]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -779,9 +779,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (5, 5): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (5, 5): Bit 1: End
- "ldr x20, [x14, #0x30]\n"
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -790,17 +790,17 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (3, 2): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (3, 2): Bit 1: End
- "ldr x20, [x14, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v20.4s, v8.4s, v9.4s\n"
"fmla v21.4s, v7.4s, v9.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v6.4s, v9.4s\n"
"fmla v24.4s, v5.4s, v9.4s\n"
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -809,10 +809,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (0, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 1): Bit 1: End
- "ldr x20, [x14, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla v16.4s, v1.4s, v12.4s\n"
"fmla v17.4s, v0.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -821,10 +821,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (0, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (0, 4): Bit 1: End
- "ldr x20, [x14, #0x48]\n"
+ "ldr x20, [x16, #0x48]\n"
"fmla v18.4s, v2.4s, v11.4s\n"
"fmla v19.4s, v1.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -833,10 +833,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x14, #0x50]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla v21.4s, v8.4s, v10.4s\n"
"fmla v22.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v23.4s, v6.4s, v10.4s\n"
"fmla v25.4s, v5.4s, v10.4s\n"
"fmla v26.4s, v4.4s, v10.4s\n"
@@ -852,10 +852,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 0): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 0): Bit 1: End
- "ldr x20, [x14, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla v16.4s, v3.4s, v9.4s\n"
"fmla v20.4s, v0.4s, v9.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -864,10 +864,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (1, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (1, 5): Bit 1: End
- "ldr x20, [x14, #0x60]\n"
+ "ldr x20, [x16, #0x60]\n"
"fmla v19.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v2.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -876,10 +876,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (4, 0): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (4, 0): Bit 1: End
- "ldr x20, [x14, #0x68]\n"
+ "ldr x20, [x16, #0x68]\n"
"fmla v24.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -888,10 +888,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (1, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (1, 2): Bit 1: End
- "ldr x20, [x14, #0x70]\n"
+ "ldr x20, [x16, #0x70]\n"
"fmla v16.4s, v5.4s, v10.4s\n"
"fmla v17.4s, v4.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.4s, v3.4s, v10.4s\n"
"fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v1.4s, v10.4s\n"
@@ -904,10 +904,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (4, 5): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 5): Bit 1: End
- "ldr x20, [x14, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla v27.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -916,10 +916,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"28:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x80]\n"
"fmla v17.4s, v5.4s, v12.4s\n"
"fmla v18.4s, v4.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.4s, v3.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
"fmla v22.4s, v1.4s, v12.4s\n"
@@ -932,10 +932,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (5, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (5, 1): Bit 1: End
- "ldr x20, [x14, #0x88]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla v28.4s, v7.4s, v11.4s\n"
"fmla v29.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 32f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
@@ -944,10 +944,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"32:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (2, 1): Bit 1: End
- "ldr x20, [x14, #0x90]\n"
+ "ldr x20, [x16, #0x90]\n"
"fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.4s, v4.4s, v10.4s\n"
"fmla v21.4s, v3.4s, v10.4s\n"
"fmla v24.4s, v1.4s, v10.4s\n"
@@ -960,10 +960,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (5, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (5, 4): Bit 1: End
- "ldr x20, [x14, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
"fmla v30.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 36f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
@@ -972,10 +972,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"36:" // Oddments: Load input (2, 4): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (2, 4): Bit 1: End
- "ldr x20, [x14, #0xa0]\n"
+ "ldr x20, [x16, #0xa0]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v26.4s, v2.4s, v12.4s\n"
@@ -988,10 +988,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (0, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (0, 2): Bit 1: End
- "ldr x20, [x14, #0xa8]\n"
+ "ldr x20, [x16, #0xa8]\n"
"fmla v16.4s, v2.4s, v10.4s\n"
"fmla v17.4s, v1.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v18.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 40f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1001,10 +1001,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"40:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x14, #0xb0]\n"
+ "ldr x20, [x16, #0xb0]\n"
"fmla v20.4s, v7.4s, v11.4s\n"
"fmla v21.4s, v6.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.4s, v4.4s, v11.4s\n"
"fmla v25.4s, v3.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
@@ -1017,10 +1017,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (0, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (0, 3): Bit 1: End
- "ldr x20, [x14, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla v17.4s, v2.4s, v12.4s\n"
"fmla v18.4s, v1.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v19.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1030,10 +1030,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"44:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"45:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x14, #0xc0]\n"
+ "ldr x20, [x16, #0xc0]\n"
"fmla v16.4s, v6.4s, v10.4s\n"
"fmla v20.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v24.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 46f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1043,10 +1043,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"47:" // Oddments: Load input (3, 4): Bit 1: End
- "ldr x20, [x14, #0xc8]\n"
+ "ldr x20, [x16, #0xc8]\n"
"fmla v22.4s, v8.4s, v11.4s\n"
"fmla v23.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.4s, v5.4s, v11.4s\n"
"fmla v27.4s, v4.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v11.4s\n"
@@ -1059,10 +1059,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"48:" // Oddments: Load input (2, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"49:" // Oddments: Load input (2, 5): Bit 1: End
- "ldr x20, [x14, #0xd0]\n"
+ "ldr x20, [x16, #0xd0]\n"
"fmla v19.4s, v8.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 50f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1072,10 +1072,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (3, 0): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"51:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x20, [x14, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
"fmla v20.4s, v6.4s, v10.4s\n"
"fmla v24.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 52f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1085,10 +1085,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"52:" // Oddments: Load input (4, 2): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"53:" // Oddments: Load input (4, 2): Bit 1: End
- "ldr x20, [x14, #0xe0]\n"
+ "ldr x20, [x16, #0xe0]\n"
"fmla v24.4s, v8.4s, v11.4s\n"
"fmla v25.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v26.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
"fmla v29.4s, v4.4s, v11.4s\n"
@@ -1101,10 +1101,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (3, 5): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"55:" // Oddments: Load input (3, 5): Bit 1: End
- "ldr x20, [x14, #0xe8]\n"
+ "ldr x20, [x16, #0xe8]\n"
"fmla v23.4s, v8.4s, v12.4s\n"
"fmla v27.4s, v5.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 56f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1114,10 +1114,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"56:" // Oddments: Load input (5, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 2): Bit 1: End
- "ldr x20, [x14, #0xf0]\n"
+ "ldr x20, [x16, #0xf0]\n"
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v30.4s, v6.4s, v10.4s\n"
"tbz %x[n_channels], #1, 58f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
@@ -1127,10 +1127,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"59:" // Oddments: Load input (4, 3): Bit 1: End
- "ldr x20, [x14, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v26.4s, v7.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
"fmla v30.4s, v4.4s, v11.4s\n"
@@ -1143,10 +1143,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"60:" // Oddments: Load input (5, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"61:" // Oddments: Load input (5, 3): Bit 1: End
- "ldr x20, [x14, #0x100]\n"
+ "ldr x20, [x16, #0x100]\n"
"fmla v29.4s, v8.4s, v12.4s\n"
"fmla v30.4s, v7.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v31.4s, v6.4s, v12.4s\n"
"tbz %x[n_channels], #1, 62f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
@@ -1156,10 +1156,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (1, 1): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"63:" // Oddments: Load input (1, 1): Bit 1: End
- "ldr x20, [x14, #0x108]\n"
+ "ldr x20, [x16, #0x108]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
"fmla v17.4s, v3.4s, v10.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v20.4s, v1.4s, v10.4s\n"
"fmla v21.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -1170,10 +1170,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"64:" // Oddments: Load input (1, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"65:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x20, [x14, #0x110]\n"
+ "ldr x20, [x16, #0x110]\n"
"fmla v18.4s, v5.4s, v11.4s\n"
"fmla v19.4s, v4.4s, v11.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v22.4s, v2.4s, v11.4s\n"
"fmla v23.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 66f\n"
@@ -1184,10 +1184,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (4, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"67:" // Oddments: Load input (4, 1): Bit 1: End
- "ldr x20, [x14, #0x118]\n"
+ "ldr x20, [x16, #0x118]\n"
"fmla v24.4s, v7.4s, v12.4s\n"
"fmla v25.4s, v6.4s, v12.4s\n"
- "add x20, x20, x13\n"
+ "add x20, x20, x15\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -1200,24 +1200,24 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"69:" // Oddments: Load input (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
"fmin v17.4s, v17.4s, v14.4s\n"
"fmin v18.4s, v18.4s, v14.4s\n"
@@ -1235,150 +1235,150 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmin v30.4s, v30.4s, v14.4s\n"
"fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 70f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.d }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.d }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.d }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.d }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.d }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.d }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.d }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.d }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
- "add x12, x12, #0x8\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v28.d }[0], [x23]\n"
"st1 { v29.d }[0], [x22]\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 71f\n"
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[2], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[2], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[2], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[2], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[2], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[2], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[2], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[2], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.s }[2], [x23]\n"
"st1 { v29.s }[2], [x22]\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Store: Bit 1: Unset
- "ldr x23, [x16, #0x0]\n"
- "ldr x22, [x16, #0x8]\n"
- "add x23, x23, x12\n"
- "add x22, x22, x12\n"
- "ldr x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x18]\n"
- "add x21, x21, x12\n"
- "add x20, x20, x12\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v16.s }[0], [x23]\n"
- "ldr x23, [x16, #0x20]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
"st1 { v17.s }[0], [x22]\n"
- "ldr x22, [x16, #0x28]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
"st1 { v18.s }[0], [x21]\n"
- "ldr x21, [x16, #0x30]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
- "ldr x20, [x16, #0x38]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
"st1 { v20.s }[0], [x23]\n"
- "ldr x23, [x16, #0x40]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
"st1 { v21.s }[0], [x22]\n"
- "ldr x22, [x16, #0x48]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
"st1 { v22.s }[0], [x21]\n"
- "ldr x21, [x16, #0x50]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
- "ldr x20, [x16, #0x58]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
"st1 { v24.s }[0], [x23]\n"
- "ldr x23, [x16, #0x60]\n"
- "add x23, x23, x12\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x22, [x16, #0x68]\n"
- "add x22, x22, x12\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
"st1 { v26.s }[0], [x21]\n"
- "ldr x21, [x16, #0x70]\n"
- "add x21, x21, x12\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
- "ldr x20, [x16, #0x78]\n"
- "add x20, x20, x12\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
"st1 { v28.s }[0], [x23]\n"
"st1 { v29.s }[0], [x22]\n"
"st1 { v30.s }[0], [x21]\n"
@@ -1387,11 +1387,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"72:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index ff521fb2ca..f727efea80 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index e42ceffb50..5ab61fad4c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -106,7 +106,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mov x23, #0x10\n" // cntb _, ALL, #1
"mul x22, x22, x26\n" // offset *= kernel_stride * output_size
- "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x16, x8, x24, LSL #2\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
@@ -118,9 +118,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x11, x13, x6\n"
"add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x10, x12, x24, LSL #2\n"
"add x9, x11, x6\n"
"add x28, x17, x21, LSL #2\n"
@@ -128,7 +128,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
@@ -150,179 +150,179 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
"add x23, x23, #0x10\n"
"add x8, x8, #0x10\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x8]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "ld1 { v14.4s }, [x12]\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v20.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
"add x16, x16, #0x10\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q17, [x15, #0x0]\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x15, #0x0]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x11]\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
"add x20, x20, #0x10\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
+ "fmla v23.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.4s, v0.4s, v25.4s\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
"add x21, x21, #0x10\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x14, x9]\n"
"ldr q4, [x15, #0x50]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x10]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v22.4s, v2.4s, v20.4s\n"
"ldr q2, [x15, #0x30]\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v29.4s, v7.4s, v18.4s\n"
"ldr q16, [x12, x13]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
+ "fmla v23.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.4s, v3.4s, v16.4s\n"
"ldr q3, [x15, #0x40]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
"ldr q13, [x8, x9]\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
+ "fmla v22.4s, v7.4s, v19.4s\n"
"ld1 { v14.4s }, [x16]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v24.4s\n"
"ldr q12, [x8, x11]\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
"ldr q16, [x8, x13]\n"
"ldr q5, [x15, #0x60]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x9]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "fmla v28.4s, v8.4s, v20.4s\n"
+ "ldr q17, [x10, x9]\n"
"ldr q6, [x15, #0x70]\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
"ldr q11, [x8, x6]\n"
"ldr q15, [x16, x6]\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
"add x14, x14, #0x10\n"
"ldr q9, [x14, x13]\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
"add x12, x12, #0x10\n"
"add x10, x10, #0x10\n"
- "st1 { v28.4s }, [x17]\n"
+ "st1 { v29.4s }, [x17]\n"
"add x15, x15, #0xa0\n"
- "str q29, [x17, x7]\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v30.4s }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "st1 { v23.4s }, [x28]\n"
+ "str q22, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
"add x8, x8, #0x10\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x16, x9]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x11]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x16, x13]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "ld1 { v14.4s }, [x12]\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v19.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
"add x16, x16, #0x10\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x6]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "ldr q16, [x14, x6]\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr q12, [x14, x11]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x11]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x9]\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x6]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q14, [x10, x11]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x10]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.4s, v0.4s, v25.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v22.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
"add x14, x14, #0x10\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v24.4s\n"
"ldr q16, [x12, x13]\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "ldr q15, [x10, x13]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
- "st1 { v28.4s }, [x17]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v22.4s, v6.4s, v17.4s\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.4s, v3.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v22.4s, v7.4s, v20.4s\n"
+ "fmla v21.4s, v7.4s, v18.4s\n"
+ "st1 { v29.4s }, [x17]\n"
"add x12, x12, #0x10\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x9]\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
"add x10, x10, #0x10\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "str q29, [x17, x7]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "st1 { v30.4s }, [x28]\n"
- "str q31, [x28, x7]\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "st1 { v22.4s }, [x28]\n"
+ "str q21, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 43f\n"
- "ldr q17, [x15, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
"add x27, x14, x13\n"
"add x26, x8, XZR\n"
@@ -369,17 +369,17 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s15, [x21, #0x0]\n"
"ldr s16, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"add x20, x16, x11\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v3.4s, v14.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
"fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 7f\n"
@@ -558,14 +558,14 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s11, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 41f\n"
"mov x21, x17\n"
"mov x20, x28\n"
@@ -591,7 +591,6 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"42:" // Tile loop: Oddments: Store: Bit 1: End
-
"43:" // Tile loop: End
"ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -606,11 +605,11 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index f65633002e..24fe255dfb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -88,258 +88,258 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "lsr x25, %x[n_channels], #0x2\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x2\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
- "sub x23, XZR, x26\n"
- "cbz x25, 3f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "cmp x26, x25, LSL #4\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
- "add x24, x24, #0xa0\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x28]\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
"ldr q10, [x20, x28]\n"
"ldp x21, x20, [x13, #0x10]\n"
"ldr q11, [x21, x28]\n"
"ldr q12, [x20, x28]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x28]\n"
- "ldr q14, [x21, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
"ldp x21, x20, [x13, #0x30]\n"
"ldr q15, [x21, x28]\n"
"ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.4s, v2.4s, v13.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.4s, v3.4s, v14.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr q17, [x24, #0x0]\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x20, x28]\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "ldr q0, [x24, #0x10]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v4.4s, v15.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
"ldr q16, [x21, x28]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v19.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.4s, v4.4s, v18.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "ldr q4, [x23, #0x50]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
- "ldr q11, [x20, x28]\n"
- "ldr q1, [x24, #0x20]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
- "ldr q2, [x24, #0x30]\n"
- "ldr x21, [x13, #0x90]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q15, [x21, x28]\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "ldr q13, [x22, x28]\n"
- "ldr q3, [x24, #0x40]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr x21, [x13, #0xb0]\n"
- "add x23, x23, #0x10\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "ldr q14, [x21, x28]\n"
+ "fmla v24.4s, v6.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v17.4s\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.4s, v3.4s, v17.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v20.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
"ldr x20, [x13, #0xb8]\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "ldr q15, [x20, x28]\n"
- "ldr q7, [x24, #0x80]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "ldr q8, [x24, #0x90]\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "ldp x22, x20, [x13, #0x0]\n"
- "ldr q9, [x22, x26]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
"add x28, x28, #0x10\n"
- "ldr q10, [x20, x26]\n"
+ "ldr q10, [x20, x25]\n"
"ldp x21, x20, [x13, #0x10]\n"
- "str q28, [x12, x23]\n"
- "add x24, x24, #0xa0\n"
- "ldr q11, [x21, x26]\n"
- "ldr q12, [x20, x26]\n"
- "str q29, [x11, x23]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldr q13, [x22, x26]\n"
- "str q30, [x10, x23]\n"
- "ldr q14, [x21, x26]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
"ldp x21, x20, [x13, #0x30]\n"
- "str q31, [x9, x23]\n"
- "ldr q15, [x21, x26]\n"
- "ldr q16, [x20, x26]\n"
- "add x26, x26, #0x10\n"
- "cmp x26, x25, LSL #4\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x22, [x13, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
"ldr x20, [x13, #0x48]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q12, [x20, x28]\n"
- "ldr x21, [x13, #0x50]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q13, [x21, x28]\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.4s, v2.4s, v13.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.4s, v3.4s, v14.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
"ldr x20, [x13, #0x58]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v28.4s, v4.4s, v15.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr x20, [x13, #0x78]\n"
- "ldr x22, [x13, #0x60]\n"
- "ldr q15, [x22, x28]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "ldr x22, [x13, #0x80]\n"
- "ldr q12, [x22, x28]\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr q11, [x21, x28]\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q13, [x20, x28]\n"
- "fmla v30.4s, v3.4s, v14.4s\n"
- "fmla v31.4s, v4.4s, v13.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q14, [x20, x28]\n"
- "fmla v30.4s, v0.4s, v15.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.4s, v4.4s, v15.4s\n"
+ "fmla v24.4s, v4.4s, v18.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
"ldr q16, [x21, x28]\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v20.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v0.4s, v23.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
"ldr x20, [x13, #0x98]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x20, x28]\n"
- "fmla v31.4s, v5.4s, v14.4s\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "ldr x21, [x13, #0x90]\n"
- "ldr q15, [x21, x28]\n"
- "fmla v30.4s, v1.4s, v16.4s\n"
- "ldr x21, [x13, #0xa8]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v7.4s, v16.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x13, #0xa0]\n"
- "ldr q13, [x22, x28]\n"
- "fmla v30.4s, v6.4s, v15.4s\n"
- "fmla v31.4s, v3.4s, v16.4s\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr q14, [x21, x28]\n"
- "fmla v30.4s, v7.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v14.4s\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v1.4s, v17.4s\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v2.4s, v19.4s\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v18.4s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
"ldr x20, [x13, #0xb8]\n"
- "ldr q15, [x20, x28]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v30.4s, v5.4s, v16.4s\n"
- "ldr x22, [x13, #0xc0]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q11, [x22, x28]\n"
- "fmla v30.4s, v8.4s, v15.4s\n"
- "fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "add x23, x23, #0x10\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.4s, v6.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
"add x28, x28, #0x10\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "str q28, [x12, x23]\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "str q29, [x11, x23]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "str q30, [x10, x23]\n"
- "str q31, [x9, x23]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 42f\n"
- "ldr q17, [x24, #0x0]\n"
- "ldr q0, [x24, #0x10]\n"
- "mov x23, x28\n"
- "add x12, x12, x23\n"
- "ldr q1, [x24, #0x20]\n"
- "ldr q2, [x24, #0x30]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "ldr q3, [x24, #0x40]\n"
- "ldr q4, [x24, #0x50]\n"
- "add x9, x9, x23\n"
- "ldr q5, [x24, #0x60]\n"
- "ldr q6, [x24, #0x70]\n"
- "ldr q7, [x24, #0x80]\n"
- "ldr q8, [x24, #0x90]\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
"ldr x27, [x13, #0x0]\n"
"ldr x26, [x13, #0x8]\n"
"add x27, x27, x28\n"
@@ -385,18 +385,18 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v15.s }[0], [x21], #0x4\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"ldr x20, [x13, #0x40]\n"
"add x20, x20, x28\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v3.4s, v14.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
"fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 6f\n"
@@ -591,14 +591,14 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v11.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (4, 4): Bit 1: End
"fmla v31.4s, v8.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmin v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 40f\n"
"st1 { v28.d }[0], [x12], #0x8\n"
"st1 { v29.d }[0], [x11], #0x8\n"
@@ -619,11 +619,11 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"42:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
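
The last two lines of this file's diff normalise the closing guard comment so that it repeats the opening condition verbatim rather than the bare macro name. A minimal sketch of the resulting convention (the kernel body is elided; only the guard shape is shown):

    #if defined(__aarch64__)
    // ... AArch64-only kernel body ...
    #endif  // defined(__aarch64__)  (comment now mirrors the opening #if expression)
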
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index c88a7d57ce..de8a1e4514 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -33,8 +33,8 @@
namespace arm_conv {
namespace depthwise {
-void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
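
The constructor hunk above swaps the literal argument list (2, 5, 1) for named constants already declared in the class. A compressed sketch of the resulting shape, filling in only what the diff context implies — CPUInfo and DepthwiseDepthfirstStrategy are the library's own types, and "Parent" is assumed to alias the base class:

    // Sketch only, not the full header.
    class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst
      : public DepthwiseDepthfirstStrategy<float, float, float, float>
    {
      using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;

      public:
      constexpr static unsigned int kernel_rows = 5, kernel_cols = 5;
      constexpr static unsigned int stride_rows = 1, stride_cols = 1;
      constexpr static unsigned int output_rows = 2, output_cols = 2;

      // Named constants replace the old literal (2, 5, 1) argument list, making
      // the argument order self-documenting and harder to transpose.
      a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
      : Parent(output_rows, output_cols, kernel_rows, kernel_cols,
               stride_rows, stride_cols) {}
    };
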
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 6ca3976f02..3426fbc3f9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -120,9 +120,9 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"add x13, x15, x2\n"
"add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x12, x14, x24, LSL #2\n"
"add x11, x13, x2\n"
"add x10, x5, x21, LSL #2\n"
@@ -130,7 +130,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"mov x21, #0x0\n"
"sub x20, XZR, x23\n"
"cbz x22, 4f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
"ldr q1, [x8, #0x20]\n"
@@ -150,366 +150,366 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1 { v14.4s }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
"add x23, x23, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q16, [x8, #0x140]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
"add x7, x7, #0x10\n"
- "fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
"ldr q1, [x8, #0x10]\n"
"cmp x23, x22, LSL #4\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"add x4, x4, #0x10\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x8, #0x20]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q17, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
"add x21, x21, #0x10\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
+ "fmla v29.4s, v3.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.4s, v19.4s, v7.4s\n"
"ld1 { v7.4s }, [x7]\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x17, x11]\n"
- "fmla v29.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v31.4s, v19.4s, v8.4s\n"
+ "fmla v29.4s, v19.4s, v14.4s\n"
+ "fmla v28.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.4s, v17.4s, v13.4s\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.4s, v17.4s, v23.4s\n"
"add x17, x17, #0x10\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ld1 { v5.4s }, [x16]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x16, x2]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v0.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.4s, v16.4s, v23.4s\n"
+ "ld1 { v24.4s }, [x16]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v0.4s\n"
+ "fmla v28.4s, v16.4s, v1.4s\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.4s, v20.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.4s, v20.4s, v1.4s\n"
+ "fmla v28.4s, v20.4s, v26.4s\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.4s, v19.4s, v6.4s\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v23.4s\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v23.4s\n"
+ "fmla v28.4s, v18.4s, v22.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ld1 { v9.4s }, [x14]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v17.4s, v22.4s\n"
+ "fmla v28.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.4s, v16.4s, v0.4s\n"
+ "ld1 { v0.4s }, [x14]\n"
+ "fmla v31.4s, v16.4s, v1.4s\n"
+ "fmla v29.4s, v16.4s, v20.4s\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v1.4s\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.4s, v11.4s, v24.4s\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.4s, v11.4s, v23.4s\n"
+ "fmla v29.4s, v11.4s, v0.4s\n"
+ "fmla v28.4s, v11.4s, v4.4s\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.4s, v18.4s, v22.4s\n"
+ "fmla v29.4s, v18.4s, v4.4s\n"
+ "fmla v28.4s, v18.4s, v6.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.4s, v17.4s, v20.4s\n"
"add x14, x14, #0x10\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v17.4s, v6.4s\n"
+ "fmla v28.4s, v17.4s, v26.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.4s, v16.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v31.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v26.4s\n"
+ "fmla v28.4s, v16.4s, v12.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.4s, v13.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.4s, v13.4s, v5.4s\n"
"ld1 { v14.4s }, [x17]\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x130]\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v29.4s, v13.4s, v12.4s\n"
+ "fmla v28.4s, v13.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.4s, v24.4s, v0.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.4s, v24.4s, v4.4s\n"
+ "fmla v29.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.4s, v24.4s, v17.4s\n"
"ldr q0, [x8, #0x150]\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v23.4s, v4.4s\n"
"ldr q13, [x7, x6]\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v23.4s, v6.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
"ldr q1, [x8, #0x160]\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v21.4s, v6.4s\n"
"ld1 { v5.4s }, [x4]\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
"ldr q2, [x8, #0x170]\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v20.4s, v26.4s\n"
"ldr q6, [x4, x2]\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v31.4s, v20.4s, v12.4s\n"
"add x12, x12, #0x10\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
"ldr q11, [x4, x15]\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
"ldr q3, [x8, #0x180]\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v12.4s\n"
"ldr q8, [x7, x2]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
"ldr q10, [x7, x11]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
"ldr q12, [x4, x13]\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
"ldr q9, [x4, x6]\n"
"ldr q4, [x8, #0x190]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"add x8, x8, #0x1a0\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x5]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x5, x3]\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
- "st1 { v30.4s }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "st1 { v29.4s }, [x10]\n"
+ "str q28, [x10, x3]\n"
"add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x7, x15]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x8, #0x0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x7, x13]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
"add x7, x7, #0x10\n"
"fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x8, #0x10]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x4, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "add x4, x4, #0x10\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x8, #0x20]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x17, x2]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x8, #0x30]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x17, x6]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x17, x15]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x40]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x8, #0x50]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x17, x11]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x8, #0x60]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x17, x13]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "add x17, x17, #0x10\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q18, [x8, #0x10]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x8, #0x70]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ld1 { v5.4s }, [x16]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x8, #0x80]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x16, x2]\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x16, x6]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x8, #0x90]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x16, x11]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x8, #0xa0]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x16, x15]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x8, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x16, x13]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ld1 { v25.4s }, [x16]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
"add x16, x16, #0x10\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x8, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ld1 { v9.4s }, [x14]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x8, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x14, x2]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x14, x13]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x8, #0xe0]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x14, x6]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x8, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x14, x15]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x8, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x14, x11]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ld1 { v7.4s }, [x14]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
"add x14, x14, #0x10\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x8, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x8, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x2]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x8, #0x130]\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.4s, v2.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v5.4s, v2.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.4s, v14.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.4s, v14.4s, v16.4s\n"
+ "fmla v30.4s, v14.4s, v13.4s\n"
+ "fmla v29.4s, v14.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
"add x8, x8, #0x140\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x12, x6]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x12, x15]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x11]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"add x12, x12, #0x10\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x5]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x5, x3]\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "st1 { v31.4s }, [x5]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v30.4s }, [x10]\n"
- "str q31, [x10, x3]\n"
+ "str q29, [x10, x3]\n"
"add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 61f\n"
- "ldr q16, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x0]\n"
"ldr q0, [x8, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
@@ -561,11 +561,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s10, [x21, #0x0]\n"
"ldr s14, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
"add x20, x7, x15\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
"fmla v30.4s, v1.4s, v8.4s\n"
@@ -934,14 +934,14 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s9, [x20, #0x0]\n"
"58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 59f\n"
"mov x21, x5\n"
"mov x20, x10\n"
@@ -967,7 +967,6 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"60:" // Tile loop: Oddments: Store: Bit 1: End
-
"61:" // Tile loop: End
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
@@ -982,7 +981,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
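
The widened clobber list in the hunk above follows directly from the register renaming: the rewritten assembly now uses v15 through v27 as scratch registers, and every SIMD register an asm block writes must be declared, or the compiler may keep live values in it across the block. A hypothetical, self-contained illustration of the mechanism — this is not the kernel itself, just the same ld1r/fmax/fmin/clobber pattern in miniature:

    #include <arm_neon.h>

    // Clamp a vector to [*pmin, *pmax] using two fixed scratch registers.
    // Because the asm writes v15 and v27, both appear in the clobber list;
    // "memory" is listed because the asm reads through the two pointers.
    static inline float32x4_t clamp_asm(float32x4_t x,
                                        const float *pmin, const float *pmax)
    {
        __asm__(
            "ld1r { v27.4s }, [%[pmin]]\n"       // broadcast activation minimum
            "ld1r { v15.4s }, [%[pmax]]\n"       // broadcast activation maximum
            "fmax %[x].4s, %[x].4s, v27.4s\n"    // apply lower bound
            "fmin %[x].4s, %[x].4s, v15.4s\n"    // apply upper bound
            : [x] "+w" (x)
            : [pmin] "r" (pmin), [pmax] "r" (pmax)
            : "memory", "v15", "v27");
        return x;
    }
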
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 860adac326..32939eb6dc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -103,16 +103,16 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x9, %x[n_channels], #0x2\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ldp x14, x13, [x21, #0x0]\n"
"ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
"sub x28, XZR, x17\n"
"cbz x9, 3f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
"cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
@@ -120,436 +120,436 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldr q5, [x27, x10]\n"
- "ldr q6, [x26, x10]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x10]\n"
- "ldr q8, [x24, x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q9, [x23, x10]\n"
- "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
"ldp x21, x20, [x15, #0x30]\n"
"ldr q11, [x21, x10]\n"
"ldr q12, [x20, x10]\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x10]\n"
- "ldr q14, [x26, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr q16, [x16, #0x140]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr q6, [x24, x10]\n"
- "fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x23, [x15, #0xa0]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x16, #0x80]\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q17, [x20, x10]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x27, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q5, [x20, x10]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x16, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x23, x10]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.4s, v4.4s, v17.4s\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.4s, v23.4s, v7.4s\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.4s, v23.4s, v14.4s\n"
+ "fmla v29.4s, v23.4s, v5.4s\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.4s, v21.4s, v8.4s\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.4s, v21.4s, v13.4s\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v13.4s\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.4s, v16.4s, v24.4s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.4s, v20.4s, v2.4s\n"
+ "fmla v29.4s, v20.4s, v8.4s\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.4s, v18.4s, v10.4s\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.4s, v18.4s, v8.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.4s, v1.4s, v14.4s\n"
+ "ldr q0, [x20, x10]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x130]\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v29.4s, v1.4s, v22.4s\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.4s, v17.4s, v5.4s\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.4s, v17.4s, v22.4s\n"
+ "fmla v29.4s, v17.4s, v21.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v16.4s, v19.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.4s, v23.4s, v1.4s\n"
+ "fmla v29.4s, v23.4s, v19.4s\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v8.4s\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.4s, v20.4s, v25.4s\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.4s, v20.4s, v19.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.4s, v6.4s, v22.4s\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v2.4s\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.4s, v18.4s, v21.4s\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.4s, v17.4s, v21.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v17.4s, v5.4s\n"
+ "fmla v29.4s, v17.4s, v25.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.4s, v13.4s, v1.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.4s, v13.4s, v19.4s\n"
+ "fmla v28.4s, v13.4s, v25.4s\n"
+ "fmla v29.4s, v13.4s, v10.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.4s, v9.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.4s, v9.4s, v0.4s\n"
+ "fmla v28.4s, v9.4s, v10.4s\n"
+ "fmla v29.4s, v9.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.4s, v24.4s, v16.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v24.4s, v2.4s\n"
+ "fmla v28.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.4s, v24.4s, v17.4s\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "fmla v31.4s, v23.4s, v5.4s\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "ldr q5, [x27, x17]\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v21.4s, v5.4s\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "fmla v28.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "ldr q6, [x26, x17]\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr q7, [x25, x17]\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v20.4s, v25.4s\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "ldr q8, [x24, x17]\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldr q13, [x22, x17]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "ldr q9, [x23, x17]\n"
+ "fmla v30.4s, v19.4s, v10.4s\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x21, x17]\n"
"ldr q4, [x16, #0x190]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"ldr q11, [x21, x17]\n"
"ldr q12, [x20, x17]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "ldp x27, x26, [x15, #0x40]\n"
- "ldr q10, [x27, x17]\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "ldr q14, [x26, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "ldr q14, [x20, x17]\n"
"add x17, x17, #0x10\n"
"cmp x17, x9, LSL #4\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
"add x10, x10, #0x10\n"
- "str q28, [x14, x28]\n"
+ "str q30, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q29, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x50]\n"
- "ldr q5, [x25, x10]\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x23, [x15, #0x60]\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "ldr x21, [x15, #0x60]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla v28.4s, v2.4s, v9.4s\n"
- "ldr q9, [x23, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "ldr x27, [x15, #0x80]\n"
- "fmla v30.4s, v3.4s, v5.4s\n"
- "fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x16, #0x30]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q9, [x20, x10]\n"
- "fmla v30.4s, v4.4s, v6.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x40]\n"
- "ldr x25, [x15, #0x90]\n"
- "fmla v28.4s, v0.4s, v7.4s\n"
- "fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x24, [x15, #0x98]\n"
- "ldr x23, [x15, #0xa0]\n"
- "fmla v30.4s, v0.4s, v14.4s\n"
- "fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q13, [x27, x10]\n"
- "fmla v29.4s, v2.4s, v5.4s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x16, #0x70]\n"
- "ldr x26, [x15, #0xc8]\n"
- "fmla v28.4s, v3.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v3.4s, v6.4s\n"
- "ldr x25, [x15, #0xd0]\n"
- "fmla v30.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v28.4s, v4.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q10, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x16, #0x90]\n"
- "ldr x24, [x15, #0xd8]\n"
- "fmla v28.4s, v0.4s, v14.4s\n"
- "ldr q14, [x20, x10]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr x23, [x15, #0xe0]\n"
- "fmla v30.4s, v0.4s, v5.4s\n"
- "fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x16, #0xa0]\n"
- "ldr x20, [x15, #0xf8]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x22, x10]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x22, [x15, #0xe8]\n"
- "fmla v30.4s, v1.4s, v6.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x16, #0xb0]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q12, [x21, x10]\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x21, [x15, #0xf0]\n"
- "fmla v30.4s, v2.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x16, #0xc0]\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x27, [x15, #0x100]\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
"ldr q3, [x16, #0xd0]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x26, x10]\n"
- "fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q8, [x23, x10]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x16, #0xe0]\n"
- "ldr x26, [x15, #0x108]\n"
- "fmla v28.4s, v0.4s, v5.4s\n"
- "ldr q5, [x25, x10]\n"
- "fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x25, [x15, #0x110]\n"
- "fmla v30.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x16, #0xf0]\n"
- "fmla v28.4s, v1.4s, v6.4s\n"
- "ldr q6, [x24, x10]\n"
- "fmla v29.4s, v1.4s, v10.4s\n"
- "ldr x24, [x15, #0x118]\n"
- "fmla v30.4s, v1.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x16, #0x100]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "ldr q10, [x22, x10]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v5.4s\n"
- "fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x16, #0x110]\n"
- "fmla v28.4s, v3.4s, v11.4s\n"
- "ldr q11, [x21, x10]\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v6.4s\n"
- "fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x16, #0x120]\n"
- "fmla v28.4s, v4.4s, v12.4s\n"
- "ldr q12, [x20, x10]\n"
- "fmla v29.4s, v4.4s, v14.4s\n"
- "fmla v30.4s, v4.4s, v8.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x16, #0x130]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.4s, v3.4s, v20.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.4s, v3.4s, v19.4s\n"
+ "fmla v30.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.4s, v11.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.4s, v11.4s, v16.4s\n"
+ "fmla v30.4s, v11.4s, v13.4s\n"
+ "fmla v29.4s, v11.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x27, x10]\n"
- "fmla v29.4s, v0.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v11.4s\n"
- "ldr q11, [x26, x10]\n"
- "fmla v31.4s, v0.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v13.4s\n"
- "fmla v29.4s, v1.4s, v5.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x10]\n"
- "fmla v31.4s, v1.4s, v9.4s\n"
- "fmla v28.4s, v2.4s, v5.4s\n"
- "fmla v29.4s, v2.4s, v6.4s\n"
- "fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q9, [x24, x10]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
"add x10, x10, #0x10\n"
- "fmla v28.4s, v3.4s, v6.4s\n"
- "fmla v29.4s, v3.4s, v8.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v31.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v4.4s, v8.4s\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x14, x28]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x13, x28]\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x13, x28]\n"
"str q30, [x12, x28]\n"
- "str q31, [x11, x28]\n"
+ "str q29, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 60f\n"
- "ldr q16, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x28, x10\n"
- "add x14, x14, x28\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x28\n"
+ "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
"add x9, x9, x10\n"
@@ -606,12 +606,12 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v10.s }[0], [x21], #0x4\n"
"ld1 { v14.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
"ldr x20, [x15, #0x50]\n"
"add x20, x20, x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
- "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
"fmla v30.4s, v1.4s, v8.4s\n"
@@ -1005,14 +1005,14 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 58f\n"
"st1 { v28.d }[0], [x14], #0x8\n"
"st1 { v29.d }[0], [x13], #0x8\n"
@@ -1030,12 +1030,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"st1 { v30.s }[0], [x12], #0x4\n"
"st1 { v31.s }[0], [x11], #0x4\n"
"59:" // Oddments: Store: Bit 1: End
-
"60:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
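For readers tracing the register renaming in the hunks above: the fmax/fmin pairs against v27 and v15 are the fused activation clamp, applied to every accumulator before it is stored. A minimal C++ NEON sketch of that pattern, assuming a two-element {min, max} layout as used by the packed variants later in this patch (the helper name is hypothetical):

#include <arm_neon.h>

// Clamp one accumulator vector to [min, max], mirroring the kernel's
// "fmax vN, vN, v_min" followed by "fmin vN, vN, v_max" store tail.
static inline float32x4_t clamp_activation(float32x4_t acc,
                                           const float *minmax_vals)
{
    const float32x4_t vmin = vdupq_n_f32(minmax_vals[0]); // ld1r from [clamps]
    const float32x4_t vmax = vdupq_n_f32(minmax_vals[1]); // ld1r from [clamps + 4]
    return vminq_f32(vmaxq_f32(acc, vmin), vmax);         // order matches this kernel's tail
}

When min <= max (always true for a valid activation range), applying fmin and fmax in either order yields the same result, which is why some hunks below clamp with fmin first.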
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index 6fa02b781e..8a8060770c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -48,4 +48,4 @@ class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKer
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 0ea3a8fbed..a2f577784f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -44,70 +45,70 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v2.4s }, [%x[minmax_vals]]\n"
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[minmax_vals], #0x4\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 5f\n"
+ "cbz x9, 5f\n"
"1:" // Channel loop
"movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr q14, [x10, x11]\n"
- "ldr q15, [x9, x11]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr q16, [x28, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x26, x11]\n"
- "ldr q19, [x25, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr q20, [x24, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
"fmla v23.4s, v14.4s, v0.4s\n"
- "ldr q14, [x10, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v24.4s, v15.4s, v0.4s\n"
"fmla v25.4s, v16.4s, v0.4s\n"
- "ldr q15, [x9, x11]\n"
- "ldr q16, [x28, x11]\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
"fmla v26.4s, v17.4s, v0.4s\n"
"fmla v27.4s, v18.4s, v0.4s\n"
- "ldr q17, [x27, x11]\n"
- "ldr q18, [x26, x11]\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
"fmla v28.4s, v19.4s, v0.4s\n"
"fmla v29.4s, v20.4s, v0.4s\n"
- "ldr q19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
"fmla v30.4s, v21.4s, v0.4s\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x24, x11]\n"
+ "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
- "ldr q22, [x22, x11]\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.4s, v14.4s, v0.4s\n"
@@ -152,7 +153,7 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"str q30, [x21, x11]\n"
"str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x12, LSL #4\n"
+ "cmp x11, x9, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -170,121 +171,121 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"7:" // Oddments: Load bias: Bit 1: End
"8:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"b 10f\n"
"9:" // Oddments: Load: Bit 1: Unset
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"10:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"ble 14f\n"
"11:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v23.4s, v14.4s, v0.4s\n"
"fmla v24.4s, v15.4s, v0.4s\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"fmla v25.4s, v16.4s, v0.4s\n"
"fmla v26.4s, v17.4s, v0.4s\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr x21, [x10], #0x8\n"
"fmla v27.4s, v18.4s, v0.4s\n"
"fmla v28.4s, v19.4s, v0.4s\n"
- "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
"fmla v29.4s, v20.4s, v0.4s\n"
"fmla v30.4s, v21.4s, v0.4s\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
- "ldr d14, [x10], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d19, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d21, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v14.s }[2], [x10], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v19.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v21.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"b 13f\n"
"12:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr s14, [x10], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s19, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s21, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"13:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"bgt 11b\n"
@@ -365,10 +366,11 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"17:" // End
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
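The hunk above is representative of the guard-directive change applied throughout this patch: the standard headers move outside the architecture guard so they are parsed on every target, and only the kernel body remains conditional. As a sketch of the resulting file shape (contents elided):

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {
// ... AArch64-only kernel implementation ...
} // namespace depthwise
} // namespace arm_conv

#endif // defined(__aarch64__)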
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 2ec0525226..6c07fa645c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 69b3865a65..9cafd23fb8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v24.4s }, [%x[clamps]]\n"
+ "ld1r { v27.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
"lsr x22, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
@@ -49,7 +50,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr q1, [x21, #0x10]\n"
"mov x21, #0x0\n"
"mov x14, #0x0\n"
- "ld1r { v23.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
@@ -101,7 +102,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v31.4s, v8.s[0]\n"
"fmla v21.4s, v31.4s, v8.s[2]\n"
"fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
"fmla v14.4s, v30.4s, v0.s[1]\n"
"fmla v15.4s, v30.4s, v0.s[3]\n"
"fmla v16.4s, v30.4s, v1.s[1]\n"
@@ -111,7 +112,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v30.4s, v8.s[1]\n"
"fmla v21.4s, v30.4s, v8.s[3]\n"
"fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"fmla v14.4s, v29.4s, v0.s[2]\n"
"fmla v15.4s, v29.4s, v1.s[0]\n"
"fmla v16.4s, v29.4s, v1.s[2]\n"
@@ -121,92 +122,92 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v29.4s, v8.s[2]\n"
"fmla v21.4s, v29.4s, v9.s[0]\n"
"fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x20]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x40]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x50]\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
"ldr q31, [%x[params], #0x70]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
"ldr q30, [%x[params], #0x80]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
"str q14, [x13, x14]\n"
"ldr q14, [%x[params], #0x60]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
"ldr q29, [%x[params], #0x90]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
"add %x[params], %x[params], #0xa0\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
"str q15, [x12, x14]\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
"str q16, [x11, x14]\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
"str q17, [x10, x14]\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"str q18, [x9, x14]\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"str q19, [x28, x14]\n"
"mov v15.16b, v14.16b\n"
"str q20, [x27, x14]\n"
@@ -231,7 +232,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v31.4s, v8.s[0]\n"
"fmla v21.4s, v31.4s, v8.s[2]\n"
"fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
"fmla v14.4s, v30.4s, v0.s[1]\n"
"fmla v15.4s, v30.4s, v0.s[3]\n"
"fmla v16.4s, v30.4s, v1.s[1]\n"
@@ -241,7 +242,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v30.4s, v8.s[1]\n"
"fmla v21.4s, v30.4s, v8.s[3]\n"
"fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"fmla v14.4s, v29.4s, v0.s[2]\n"
"fmla v15.4s, v29.4s, v1.s[0]\n"
"fmla v16.4s, v29.4s, v1.s[2]\n"
@@ -251,87 +252,87 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v20.4s, v29.4s, v8.s[2]\n"
"fmla v21.4s, v29.4s, v9.s[0]\n"
"fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x20]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x40]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x50]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x60\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
"str q14, [x13, x14]\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
"str q15, [x12, x14]\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
"str q16, [x11, x14]\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
"str q17, [x10, x14]\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"str q18, [x9, x14]\n"
"str q19, [x28, x14]\n"
"str q20, [x27, x14]\n"
@@ -342,123 +343,123 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q14, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x10]\n"
+ "ldr q25, [%x[params], #0x10]\n"
"mov v15.16b, v14.16b\n"
"mov v16.16b, v14.16b\n"
- "ldr q30, [%x[params], #0x20]\n"
- "ldr q29, [%x[params], #0x30]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "ldr q23, [%x[params], #0x30]\n"
"mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
"mov v20.16b, v14.16b\n"
- "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v25.4s, v0.s[2]\n"
"mov v21.16b, v14.16b\n"
"mov v22.16b, v14.16b\n"
- "fmla v14.4s, v31.4s, v0.s[0]\n"
- "fmla v16.4s, v31.4s, v1.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[0]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v5.s[0]\n"
- "fmla v20.4s, v31.4s, v8.s[0]\n"
- "fmla v21.4s, v31.4s, v8.s[2]\n"
- "fmla v22.4s, v31.4s, v9.s[0]\n"
- "ldr q31, [%x[params], #0x40]\n"
- "fmla v14.4s, v30.4s, v0.s[1]\n"
- "fmla v15.4s, v30.4s, v0.s[3]\n"
- "fmla v16.4s, v30.4s, v1.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[1]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[1]\n"
- "fmla v20.4s, v30.4s, v8.s[1]\n"
- "fmla v21.4s, v30.4s, v8.s[3]\n"
- "fmla v22.4s, v30.4s, v9.s[1]\n"
- "ldr q30, [%x[params], #0x50]\n"
- "fmla v14.4s, v29.4s, v0.s[2]\n"
- "fmla v15.4s, v29.4s, v1.s[0]\n"
- "fmla v16.4s, v29.4s, v1.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[2]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[2]\n"
- "fmla v20.4s, v29.4s, v8.s[2]\n"
- "fmla v21.4s, v29.4s, v9.s[0]\n"
- "fmla v22.4s, v29.4s, v9.s[2]\n"
- "ldr q29, [%x[params], #0x60]\n"
- "fmla v14.4s, v31.4s, v2.s[0]\n"
- "fmla v15.4s, v31.4s, v2.s[2]\n"
- "fmla v16.4s, v31.4s, v3.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[0]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v7.s[0]\n"
- "fmla v20.4s, v31.4s, v10.s[0]\n"
- "fmla v21.4s, v31.4s, v10.s[2]\n"
- "fmla v22.4s, v31.4s, v11.s[0]\n"
- "ldr q31, [%x[params], #0x70]\n"
- "fmla v14.4s, v30.4s, v2.s[1]\n"
- "fmla v15.4s, v30.4s, v2.s[3]\n"
- "fmla v16.4s, v30.4s, v3.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[1]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[1]\n"
- "fmla v20.4s, v30.4s, v10.s[1]\n"
- "fmla v21.4s, v30.4s, v10.s[3]\n"
- "fmla v22.4s, v30.4s, v11.s[1]\n"
- "ldr q30, [%x[params], #0x80]\n"
- "fmla v14.4s, v29.4s, v2.s[2]\n"
- "fmla v15.4s, v29.4s, v3.s[0]\n"
- "fmla v16.4s, v29.4s, v3.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[2]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[2]\n"
- "fmla v20.4s, v29.4s, v10.s[2]\n"
- "fmla v21.4s, v29.4s, v11.s[0]\n"
- "fmla v22.4s, v29.4s, v11.s[2]\n"
- "ldr q29, [%x[params], #0x90]\n"
+ "fmla v14.4s, v25.4s, v0.s[0]\n"
+ "fmla v16.4s, v25.4s, v1.s[0]\n"
+ "fmla v17.4s, v25.4s, v4.s[0]\n"
+ "fmla v18.4s, v25.4s, v4.s[2]\n"
+ "fmla v19.4s, v25.4s, v5.s[0]\n"
+ "fmla v20.4s, v25.4s, v8.s[0]\n"
+ "fmla v21.4s, v25.4s, v8.s[2]\n"
+ "fmla v22.4s, v25.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x40]\n"
+ "fmla v14.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[3]\n"
+ "fmla v19.4s, v24.4s, v5.s[1]\n"
+ "fmla v20.4s, v24.4s, v8.s[1]\n"
+ "fmla v21.4s, v24.4s, v8.s[3]\n"
+ "fmla v22.4s, v24.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v14.4s, v23.4s, v0.s[2]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v1.s[2]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v5.s[0]\n"
+ "fmla v19.4s, v23.4s, v5.s[2]\n"
+ "fmla v20.4s, v23.4s, v8.s[2]\n"
+ "fmla v21.4s, v23.4s, v9.s[0]\n"
+ "fmla v22.4s, v23.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x90]\n"
"add %x[params], %x[params], #0xa0\n"
- "fmla v14.4s, v31.4s, v4.s[0]\n"
- "fmla v15.4s, v31.4s, v4.s[2]\n"
- "fmla v16.4s, v31.4s, v5.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[0]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v9.s[0]\n"
- "fmla v20.4s, v31.4s, v12.s[0]\n"
- "fmla v21.4s, v31.4s, v12.s[2]\n"
- "fmla v22.4s, v31.4s, v13.s[0]\n"
- "fmla v14.4s, v30.4s, v4.s[1]\n"
- "fmla v15.4s, v30.4s, v4.s[3]\n"
- "fmla v16.4s, v30.4s, v5.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[1]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[1]\n"
- "fmla v20.4s, v30.4s, v12.s[1]\n"
- "fmla v21.4s, v30.4s, v12.s[3]\n"
- "fmla v22.4s, v30.4s, v13.s[1]\n"
- "fmla v14.4s, v29.4s, v4.s[2]\n"
- "fmla v15.4s, v29.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmla v16.4s, v29.4s, v5.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmla v20.4s, v29.4s, v12.s[2]\n"
- "fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "fmax v17.4s, v17.4s, v24.4s\n"
- "fmax v18.4s, v18.4s, v24.4s\n"
- "fmax v19.4s, v19.4s, v24.4s\n"
- "fmax v20.4s, v20.4s, v24.4s\n"
- "fmax v21.4s, v21.4s, v24.4s\n"
- "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
"add x20, x13, x14\n"
"add x22, x12, x14\n"
@@ -519,15 +520,14 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"st1 { v21.s }[0], [x21]\n"
"st1 { v22.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
-
"6:" // End
-
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
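Each renumbered kernel also extends its clobber list (v23-v27 and friends in the hunk above). With GCC extended asm, any register the assembly writes that is not an output operand must be declared clobbered, otherwise the compiler may keep a live value in it across the statement. A self-contained illustration of the rule, deliberately unrelated to this kernel's actual body:

#include <cstdint>

// x20 is scratch inside the asm, so it appears in the clobber list;
// omitting it would let the compiler allocate x20 across the statement.
static inline uint64_t add_one(uint64_t x)
{
    uint64_t out;
    __asm__ __volatile__(
        "mov x20, %x[in]\n"
        "add %x[out], x20, #1\n"
        : [out] "=r" (out)
        : [in] "r" (x)
        : "cc", "x20"
    );
    return out;
}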
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index 5ae8dd3653..9f514c78e7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
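The header diffs in this patch also shorten the utils include, resolving it through the kernel directory's include search path rather than a repository-root-relative one; a sketch of the resulting preamble (build-system include paths assumed):

#include "utils.hpp"   // previously: "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include <cstdint>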
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 50848cc2e8..c9bb1f41da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,7 +42,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v21.4s }, [%x[clamps]]\n"
+ "ld1r { v26.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
"lsr x22, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
@@ -50,7 +50,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ldr q1, [x21, #0x10]\n"
"mov x21, #0x0\n"
"mov x13, #0x0\n"
- "ld1r { v20.4s }, [x20]\n"
+ "ld1r { v25.4s }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
@@ -98,7 +98,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v31.4s, v2.s[1]\n"
"fmla v18.4s, v31.4s, v2.s[2]\n"
"fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x0]\n"
"fmla v12.4s, v30.4s, v0.s[1]\n"
"fmla v13.4s, v30.4s, v0.s[2]\n"
"fmla v14.4s, v30.4s, v0.s[3]\n"
@@ -107,7 +107,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v30.4s, v2.s[2]\n"
"fmla v18.4s, v30.4s, v2.s[3]\n"
"fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q23, [%x[params], #0x10]\n"
"fmla v12.4s, v29.4s, v0.s[2]\n"
"fmla v13.4s, v29.4s, v0.s[3]\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
@@ -116,7 +116,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v29.4s, v2.s[3]\n"
"fmla v18.4s, v29.4s, v3.s[0]\n"
"fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x20]\n"
"fmla v12.4s, v28.4s, v0.s[3]\n"
"fmla v13.4s, v28.4s, v1.s[0]\n"
"fmla v14.4s, v28.4s, v1.s[1]\n"
@@ -125,7 +125,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v28.4s, v3.s[0]\n"
"fmla v18.4s, v28.4s, v3.s[1]\n"
"fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x30]\n"
+ "ldr q21, [%x[params], #0x30]\n"
"fmla v12.4s, v27.4s, v1.s[0]\n"
"fmla v13.4s, v27.4s, v1.s[1]\n"
"fmla v14.4s, v27.4s, v1.s[2]\n"
@@ -134,209 +134,209 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v27.4s, v3.s[1]\n"
"fmla v18.4s, v27.4s, v3.s[2]\n"
"fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0x40]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0x50]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0x60]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0x70]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0x80]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0x90]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0xa0]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0xb0]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0xc0]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0xd0]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0xe0]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0xf0]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x100]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x110]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x120]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x130]\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
"ldr q31, [%x[params], #0x150]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
"ldr q30, [%x[params], #0x160]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
"ldr q29, [%x[params], #0x170]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
"ldr q28, [%x[params], #0x180]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
"str q12, [x12, x13]\n"
"ldr q12, [%x[params], #0x140]\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
"ldr q27, [%x[params], #0x190]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
"add %x[params], %x[params], #0x1a0\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
"str q13, [x11, x13]\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
"str q14, [x10, x13]\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
"str q15, [x9, x13]\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"str q16, [x28, x13]\n"
"str q17, [x27, x13]\n"
"mov v13.16b, v12.16b\n"
@@ -359,7 +359,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v31.4s, v2.s[1]\n"
"fmla v18.4s, v31.4s, v2.s[2]\n"
"fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x0]\n"
"fmla v12.4s, v30.4s, v0.s[1]\n"
"fmla v13.4s, v30.4s, v0.s[2]\n"
"fmla v14.4s, v30.4s, v0.s[3]\n"
@@ -368,7 +368,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v30.4s, v2.s[2]\n"
"fmla v18.4s, v30.4s, v2.s[3]\n"
"fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x10]\n"
+ "ldr q23, [%x[params], #0x10]\n"
"fmla v12.4s, v29.4s, v0.s[2]\n"
"fmla v13.4s, v29.4s, v0.s[3]\n"
"fmla v14.4s, v29.4s, v1.s[0]\n"
@@ -377,7 +377,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v29.4s, v2.s[3]\n"
"fmla v18.4s, v29.4s, v3.s[0]\n"
"fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x20]\n"
"fmla v12.4s, v28.4s, v0.s[3]\n"
"fmla v13.4s, v28.4s, v1.s[0]\n"
"fmla v14.4s, v28.4s, v1.s[1]\n"
@@ -386,7 +386,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v28.4s, v3.s[0]\n"
"fmla v18.4s, v28.4s, v3.s[1]\n"
"fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x30]\n"
+ "ldr q21, [%x[params], #0x30]\n"
"fmla v12.4s, v27.4s, v1.s[0]\n"
"fmla v13.4s, v27.4s, v1.s[1]\n"
"fmla v14.4s, v27.4s, v1.s[2]\n"
@@ -395,202 +395,202 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v17.4s, v27.4s, v3.s[1]\n"
"fmla v18.4s, v27.4s, v3.s[2]\n"
"fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0x40]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0x50]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0x60]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0x70]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0x80]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0x90]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0xa0]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0xb0]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0xc0]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0xd0]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0xe0]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0xf0]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x100]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x110]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x120]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x130]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
"add %x[params], %x[params], #0x140\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
"str q12, [x12, x13]\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
"str q13, [x11, x13]\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
"str q14, [x10, x13]\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"str q15, [x9, x13]\n"
"str q16, [x28, x13]\n"
"str q17, [x27, x13]\n"
@@ -601,255 +601,255 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q12, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x10]\n"
"mov v13.16b, v12.16b\n"
"mov v14.16b, v12.16b\n"
- "ldr q30, [%x[params], #0x20]\n"
- "ldr q29, [%x[params], #0x30]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
"mov v15.16b, v12.16b\n"
"mov v16.16b, v12.16b\n"
- "ldr q28, [%x[params], #0x40]\n"
- "ldr q27, [%x[params], #0x50]\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
"mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v12.16b\n"
- "fmla v12.4s, v31.4s, v0.s[0]\n"
- "fmla v13.4s, v31.4s, v0.s[1]\n"
- "fmla v14.4s, v31.4s, v0.s[2]\n"
- "fmla v15.4s, v31.4s, v0.s[3]\n"
- "fmla v16.4s, v31.4s, v2.s[0]\n"
- "fmla v17.4s, v31.4s, v2.s[1]\n"
- "fmla v18.4s, v31.4s, v2.s[2]\n"
- "fmla v19.4s, v31.4s, v2.s[3]\n"
- "ldr q31, [%x[params], #0x60]\n"
- "fmla v12.4s, v30.4s, v0.s[1]\n"
- "fmla v13.4s, v30.4s, v0.s[2]\n"
- "fmla v14.4s, v30.4s, v0.s[3]\n"
- "fmla v15.4s, v30.4s, v1.s[0]\n"
- "fmla v16.4s, v30.4s, v2.s[1]\n"
- "fmla v17.4s, v30.4s, v2.s[2]\n"
- "fmla v18.4s, v30.4s, v2.s[3]\n"
- "fmla v19.4s, v30.4s, v3.s[0]\n"
- "ldr q30, [%x[params], #0x70]\n"
- "fmla v12.4s, v29.4s, v0.s[2]\n"
- "fmla v13.4s, v29.4s, v0.s[3]\n"
- "fmla v14.4s, v29.4s, v1.s[0]\n"
- "fmla v15.4s, v29.4s, v1.s[1]\n"
- "fmla v16.4s, v29.4s, v2.s[2]\n"
- "fmla v17.4s, v29.4s, v2.s[3]\n"
- "fmla v18.4s, v29.4s, v3.s[0]\n"
- "fmla v19.4s, v29.4s, v3.s[1]\n"
- "ldr q29, [%x[params], #0x80]\n"
- "fmla v12.4s, v28.4s, v0.s[3]\n"
- "fmla v13.4s, v28.4s, v1.s[0]\n"
- "fmla v14.4s, v28.4s, v1.s[1]\n"
- "fmla v15.4s, v28.4s, v1.s[2]\n"
- "fmla v16.4s, v28.4s, v2.s[3]\n"
- "fmla v17.4s, v28.4s, v3.s[0]\n"
- "fmla v18.4s, v28.4s, v3.s[1]\n"
- "fmla v19.4s, v28.4s, v3.s[2]\n"
- "ldr q28, [%x[params], #0x90]\n"
- "fmla v12.4s, v27.4s, v1.s[0]\n"
- "fmla v13.4s, v27.4s, v1.s[1]\n"
- "fmla v14.4s, v27.4s, v1.s[2]\n"
- "fmla v15.4s, v27.4s, v1.s[3]\n"
- "fmla v16.4s, v27.4s, v3.s[0]\n"
- "fmla v17.4s, v27.4s, v3.s[1]\n"
- "fmla v18.4s, v27.4s, v3.s[2]\n"
- "fmla v19.4s, v27.4s, v3.s[3]\n"
- "ldr q27, [%x[params], #0xa0]\n"
- "fmla v12.4s, v31.4s, v2.s[0]\n"
- "fmla v13.4s, v31.4s, v2.s[1]\n"
- "fmla v14.4s, v31.4s, v2.s[2]\n"
- "fmla v15.4s, v31.4s, v2.s[3]\n"
- "fmla v16.4s, v31.4s, v4.s[0]\n"
- "fmla v17.4s, v31.4s, v4.s[1]\n"
- "fmla v18.4s, v31.4s, v4.s[2]\n"
- "fmla v19.4s, v31.4s, v4.s[3]\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "fmla v12.4s, v30.4s, v2.s[1]\n"
- "fmla v13.4s, v30.4s, v2.s[2]\n"
- "fmla v14.4s, v30.4s, v2.s[3]\n"
- "fmla v15.4s, v30.4s, v3.s[0]\n"
- "fmla v16.4s, v30.4s, v4.s[1]\n"
- "fmla v17.4s, v30.4s, v4.s[2]\n"
- "fmla v18.4s, v30.4s, v4.s[3]\n"
- "fmla v19.4s, v30.4s, v5.s[0]\n"
- "ldr q30, [%x[params], #0xc0]\n"
- "fmla v12.4s, v29.4s, v2.s[2]\n"
- "fmla v13.4s, v29.4s, v2.s[3]\n"
- "fmla v14.4s, v29.4s, v3.s[0]\n"
- "fmla v15.4s, v29.4s, v3.s[1]\n"
- "fmla v16.4s, v29.4s, v4.s[2]\n"
- "fmla v17.4s, v29.4s, v4.s[3]\n"
- "fmla v18.4s, v29.4s, v5.s[0]\n"
- "fmla v19.4s, v29.4s, v5.s[1]\n"
- "ldr q29, [%x[params], #0xd0]\n"
- "fmla v12.4s, v28.4s, v2.s[3]\n"
- "fmla v13.4s, v28.4s, v3.s[0]\n"
- "fmla v14.4s, v28.4s, v3.s[1]\n"
- "fmla v15.4s, v28.4s, v3.s[2]\n"
- "fmla v16.4s, v28.4s, v4.s[3]\n"
- "fmla v17.4s, v28.4s, v5.s[0]\n"
- "fmla v18.4s, v28.4s, v5.s[1]\n"
- "fmla v19.4s, v28.4s, v5.s[2]\n"
- "ldr q28, [%x[params], #0xe0]\n"
- "fmla v12.4s, v27.4s, v3.s[0]\n"
- "fmla v13.4s, v27.4s, v3.s[1]\n"
- "fmla v14.4s, v27.4s, v3.s[2]\n"
- "fmla v15.4s, v27.4s, v3.s[3]\n"
- "fmla v16.4s, v27.4s, v5.s[0]\n"
- "fmla v17.4s, v27.4s, v5.s[1]\n"
- "fmla v18.4s, v27.4s, v5.s[2]\n"
- "fmla v19.4s, v27.4s, v5.s[3]\n"
- "ldr q27, [%x[params], #0xf0]\n"
- "fmla v12.4s, v31.4s, v4.s[0]\n"
- "fmla v13.4s, v31.4s, v4.s[1]\n"
- "fmla v14.4s, v31.4s, v4.s[2]\n"
- "fmla v15.4s, v31.4s, v4.s[3]\n"
- "fmla v16.4s, v31.4s, v6.s[0]\n"
- "fmla v17.4s, v31.4s, v6.s[1]\n"
- "fmla v18.4s, v31.4s, v6.s[2]\n"
- "fmla v19.4s, v31.4s, v6.s[3]\n"
- "ldr q31, [%x[params], #0x100]\n"
- "fmla v12.4s, v30.4s, v4.s[1]\n"
- "fmla v13.4s, v30.4s, v4.s[2]\n"
- "fmla v14.4s, v30.4s, v4.s[3]\n"
- "fmla v15.4s, v30.4s, v5.s[0]\n"
- "fmla v16.4s, v30.4s, v6.s[1]\n"
- "fmla v17.4s, v30.4s, v6.s[2]\n"
- "fmla v18.4s, v30.4s, v6.s[3]\n"
- "fmla v19.4s, v30.4s, v7.s[0]\n"
- "ldr q30, [%x[params], #0x110]\n"
- "fmla v12.4s, v29.4s, v4.s[2]\n"
- "fmla v13.4s, v29.4s, v4.s[3]\n"
- "fmla v14.4s, v29.4s, v5.s[0]\n"
- "fmla v15.4s, v29.4s, v5.s[1]\n"
- "fmla v16.4s, v29.4s, v6.s[2]\n"
- "fmla v17.4s, v29.4s, v6.s[3]\n"
- "fmla v18.4s, v29.4s, v7.s[0]\n"
- "fmla v19.4s, v29.4s, v7.s[1]\n"
- "ldr q29, [%x[params], #0x120]\n"
- "fmla v12.4s, v28.4s, v4.s[3]\n"
- "fmla v13.4s, v28.4s, v5.s[0]\n"
- "fmla v14.4s, v28.4s, v5.s[1]\n"
- "fmla v15.4s, v28.4s, v5.s[2]\n"
- "fmla v16.4s, v28.4s, v6.s[3]\n"
- "fmla v17.4s, v28.4s, v7.s[0]\n"
- "fmla v18.4s, v28.4s, v7.s[1]\n"
- "fmla v19.4s, v28.4s, v7.s[2]\n"
- "ldr q28, [%x[params], #0x130]\n"
- "fmla v12.4s, v27.4s, v5.s[0]\n"
- "fmla v13.4s, v27.4s, v5.s[1]\n"
- "fmla v14.4s, v27.4s, v5.s[2]\n"
- "fmla v15.4s, v27.4s, v5.s[3]\n"
- "fmla v16.4s, v27.4s, v7.s[0]\n"
- "fmla v17.4s, v27.4s, v7.s[1]\n"
- "fmla v18.4s, v27.4s, v7.s[2]\n"
- "fmla v19.4s, v27.4s, v7.s[3]\n"
- "ldr q27, [%x[params], #0x140]\n"
- "fmla v12.4s, v31.4s, v6.s[0]\n"
- "fmla v13.4s, v31.4s, v6.s[1]\n"
- "fmla v14.4s, v31.4s, v6.s[2]\n"
- "fmla v15.4s, v31.4s, v6.s[3]\n"
- "fmla v16.4s, v31.4s, v8.s[0]\n"
- "fmla v17.4s, v31.4s, v8.s[1]\n"
- "fmla v18.4s, v31.4s, v8.s[2]\n"
- "fmla v19.4s, v31.4s, v8.s[3]\n"
- "ldr q31, [%x[params], #0x150]\n"
- "fmla v12.4s, v30.4s, v6.s[1]\n"
- "fmla v13.4s, v30.4s, v6.s[2]\n"
- "fmla v14.4s, v30.4s, v6.s[3]\n"
- "fmla v15.4s, v30.4s, v7.s[0]\n"
- "fmla v16.4s, v30.4s, v8.s[1]\n"
- "fmla v17.4s, v30.4s, v8.s[2]\n"
- "fmla v18.4s, v30.4s, v8.s[3]\n"
- "fmla v19.4s, v30.4s, v9.s[0]\n"
- "ldr q30, [%x[params], #0x160]\n"
- "fmla v12.4s, v29.4s, v6.s[2]\n"
- "fmla v13.4s, v29.4s, v6.s[3]\n"
- "fmla v14.4s, v29.4s, v7.s[0]\n"
- "fmla v15.4s, v29.4s, v7.s[1]\n"
- "fmla v16.4s, v29.4s, v8.s[2]\n"
- "fmla v17.4s, v29.4s, v8.s[3]\n"
- "fmla v18.4s, v29.4s, v9.s[0]\n"
- "fmla v19.4s, v29.4s, v9.s[1]\n"
- "ldr q29, [%x[params], #0x170]\n"
- "fmla v12.4s, v28.4s, v6.s[3]\n"
- "fmla v13.4s, v28.4s, v7.s[0]\n"
- "fmla v14.4s, v28.4s, v7.s[1]\n"
- "fmla v15.4s, v28.4s, v7.s[2]\n"
- "fmla v16.4s, v28.4s, v8.s[3]\n"
- "fmla v17.4s, v28.4s, v9.s[0]\n"
- "fmla v18.4s, v28.4s, v9.s[1]\n"
- "fmla v19.4s, v28.4s, v9.s[2]\n"
- "ldr q28, [%x[params], #0x180]\n"
- "fmla v12.4s, v27.4s, v7.s[0]\n"
- "fmla v13.4s, v27.4s, v7.s[1]\n"
- "fmla v14.4s, v27.4s, v7.s[2]\n"
- "fmla v15.4s, v27.4s, v7.s[3]\n"
- "fmla v16.4s, v27.4s, v9.s[0]\n"
- "fmla v17.4s, v27.4s, v9.s[1]\n"
- "fmla v18.4s, v27.4s, v9.s[2]\n"
- "fmla v19.4s, v27.4s, v9.s[3]\n"
- "ldr q27, [%x[params], #0x190]\n"
+ "fmla v12.4s, v24.4s, v0.s[0]\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
+ "fmla v14.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v2.s[0]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v18.4s, v24.4s, v2.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
+ "fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v0.s[3]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v2.s[1]\n"
+ "fmla v17.4s, v23.4s, v2.s[2]\n"
+ "fmla v18.4s, v23.4s, v2.s[3]\n"
+ "fmla v19.4s, v23.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
+ "fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v14.4s, v22.4s, v1.s[0]\n"
+ "fmla v15.4s, v22.4s, v1.s[1]\n"
+ "fmla v16.4s, v22.4s, v2.s[2]\n"
+ "fmla v17.4s, v22.4s, v2.s[3]\n"
+ "fmla v18.4s, v22.4s, v3.s[0]\n"
+ "fmla v19.4s, v22.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x80]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v15.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "fmla v17.4s, v21.4s, v3.s[0]\n"
+ "fmla v18.4s, v21.4s, v3.s[1]\n"
+ "fmla v19.4s, v21.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x90]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v14.4s, v20.4s, v1.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v16.4s, v20.4s, v3.s[0]\n"
+ "fmla v17.4s, v20.4s, v3.s[1]\n"
+ "fmla v18.4s, v20.4s, v3.s[2]\n"
+ "fmla v19.4s, v20.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0x100]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0x110]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0x120]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0x130]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0x140]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x190]\n"
"add %x[params], %x[params], #0x1a0\n"
- "fmla v12.4s, v31.4s, v8.s[0]\n"
- "fmla v13.4s, v31.4s, v8.s[1]\n"
- "fmla v14.4s, v31.4s, v8.s[2]\n"
- "fmla v15.4s, v31.4s, v8.s[3]\n"
- "fmla v16.4s, v31.4s, v10.s[0]\n"
- "fmla v17.4s, v31.4s, v10.s[1]\n"
- "fmla v18.4s, v31.4s, v10.s[2]\n"
- "fmla v19.4s, v31.4s, v10.s[3]\n"
- "fmla v12.4s, v30.4s, v8.s[1]\n"
- "fmla v13.4s, v30.4s, v8.s[2]\n"
- "fmla v14.4s, v30.4s, v8.s[3]\n"
- "fmla v15.4s, v30.4s, v9.s[0]\n"
- "fmla v16.4s, v30.4s, v10.s[1]\n"
- "fmla v17.4s, v30.4s, v10.s[2]\n"
- "fmla v18.4s, v30.4s, v10.s[3]\n"
- "fmla v19.4s, v30.4s, v11.s[0]\n"
- "fmla v12.4s, v29.4s, v8.s[2]\n"
- "fmla v13.4s, v29.4s, v8.s[3]\n"
- "fmla v14.4s, v29.4s, v9.s[0]\n"
- "fmla v15.4s, v29.4s, v9.s[1]\n"
- "fmla v16.4s, v29.4s, v10.s[2]\n"
- "fmla v17.4s, v29.4s, v10.s[3]\n"
- "fmla v18.4s, v29.4s, v11.s[0]\n"
- "fmla v19.4s, v29.4s, v11.s[1]\n"
- "fmla v12.4s, v28.4s, v8.s[3]\n"
- "fmla v13.4s, v28.4s, v9.s[0]\n"
- "fmla v14.4s, v28.4s, v9.s[1]\n"
- "fmla v15.4s, v28.4s, v9.s[2]\n"
- "fmla v16.4s, v28.4s, v10.s[3]\n"
- "fmla v17.4s, v28.4s, v11.s[0]\n"
- "fmla v18.4s, v28.4s, v11.s[1]\n"
- "fmla v19.4s, v28.4s, v11.s[2]\n"
- "fmla v12.4s, v27.4s, v9.s[0]\n"
- "fmla v13.4s, v27.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmla v14.4s, v27.4s, v9.s[2]\n"
- "fmla v15.4s, v27.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmla v16.4s, v27.4s, v11.s[0]\n"
- "fmla v17.4s, v27.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmla v18.4s, v27.4s, v11.s[2]\n"
- "fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v15.4s, v15.4s, v20.4s\n"
- "fmin v16.4s, v16.4s, v20.4s\n"
- "fmin v17.4s, v17.4s, v20.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "fmax v15.4s, v15.4s, v21.4s\n"
- "fmax v16.4s, v16.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v21.4s\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
"add x20, x12, x13\n"
"add x21, x11, x13\n"
@@ -904,15 +904,14 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"st1 { v18.s }[0], [x21]\n"
"st1 { v19.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
-
"6:" // End
-
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
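(Editorial note, not part of the patch: the register renumbering above leaves the kernel's core pattern unchanged. Each weight vector is fused-multiply-accumulated against one broadcast input lane, e.g. "fmla v12.4s, v22.4s, v8.s[2]", and each accumulator is then clamped to the activation bounds with fmin against the max vector (v25 after this change) and fmax against the min vector (v26) before being stored. A minimal C++/NEON-intrinsics sketch of that pattern follows; the function name and single-vector shape are hypothetical, chosen only to mirror one accumulator's worth of the assembly.)

#include <arm_neon.h>

// Sketch of the accumulate-then-clamp step used throughout the kernel above.
void accumulate_and_clamp(const float *weights, const float *inputs,
                          float *output, float act_min, float act_max)
{
    float32x4_t acc = vld1q_f32(output);   // running accumulator (cf. v12)
    float32x4_t w   = vld1q_f32(weights);  // one weight vector   (cf. v22)
    float32x4_t in  = vld1q_f32(inputs);   // four input lanes    (cf. v8)

    // fmla v12.4s, v22.4s, v8.s[2]: acc += w * in[2] (lane broadcast)
    acc = vfmaq_laneq_f32(acc, w, in, 2);

    // fmin/fmax clamp to the activation range (cf. v25 = max, v26 = min)
    acc = vminq_f32(acc, vdupq_n_f32(act_max));
    acc = vmaxq_f32(acc, vdupq_n_f32(act_min));

    vst1q_f32(output, acc);                // str q12, [x12, x13]
}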
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index d60e15ec84..3bece73973 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index c28f29c4f9..cc18dd4bb4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
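(Editorial note, not part of the patch: the hunk above shows the guard-directive change applied across these kernels — the standard headers are hoisted out of the architecture guard so they are always processed, and only the AArch64-specific body remains guarded. Schematically, the resulting file layout is:)

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {
// ... kernel implementation ...
}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)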
@@ -43,10 +44,10 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v11.4s }, [%x[minmax_vals]]\n"
+ "ld1r { v12.4s }, [%x[minmax_vals]]\n"
"lsr x11, %x[n_output_channels], #0x2\n"
"add x20, %x[minmax_vals], #0x4\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"mov x10, #0x0\n"
"cbz x11, 8f\n"
"1:" // Output channel loop
@@ -55,16 +56,16 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"lsl x20, x10, #0x2\n"
"ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "ldr q9, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
@@ -80,368 +81,368 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 6f\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 6f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q2, [x23, #0x10]\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q0, [x9, #0x10]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x23, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q7, [x21, #0x10]\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x20, #0x0]\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "ldr q3, [x23, #0x0]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x10]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [x9, #0x0]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q4, [%x[weights], #0x10]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "lsl x28, x10, #0x2\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
"lsl x28, x10, #0x2\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "add %x[weights], %x[weights], #0x10\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
+ "add %x[weights], %x[weights], #0x10\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v16.4s, v1.4s, v4.s[0]\n"
+ "fmla v17.4s, v1.4s, v4.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v4.s[2]\n"
+ "fmla v19.4s, v1.4s, v4.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v1.4s, v3.s[0]\n"
+ "fmla v21.4s, v1.4s, v3.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v1.4s, v3.s[2]\n"
+ "fmla v23.4s, v1.4s, v3.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v2.s[0]\n"
+ "fmla v25.4s, v1.4s, v2.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v2.s[2]\n"
+ "fmla v27.4s, v1.4s, v2.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v0.s[2]\n"
+ "fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmin v16.4s, v16.4s, v10.4s\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"lsl x28, x10, #0x2\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "str q16, [x20, x28]\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "str q17, [x21, x28]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "str q18, [x22, x28]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q19, [x23, x28]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q20, [x24, x28]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "str q21, [x25, x28]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q22, [x26, x28]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "str q23, [x27, x28]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "str q24, [x20, x28]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q25, [x21, x28]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "str q26, [x22, x28]\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q27, [x23, x28]\n"
- "str q28, [x24, x28]\n"
- "str q29, [x25, x28]\n"
- "str q30, [x26, x28]\n"
- "str q31, [x27, x28]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x4\n"
"cmp x10, x11, LSL #2\n"
@@ -461,16 +462,16 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ld1 { v31.s }[0], [x20]\n"
"10:" // Output channel oddments: Load bias: Bit 1: End
"11:" // Output channel oddments: Load bias: Done
- "ldr q9, [%x[weights], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr q8, [x23, #0x0]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q5, [x9, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
@@ -486,66 +487,82 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"mov v29.16b, v31.16b\n"
"mov v30.16b, v31.16b\n"
"mov v31.16b, v31.16b\n"
- "cbz x21, 15f\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
+ "cbz x23, 15f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q3, [x23, #0x0]\n"
- "ldr q2, [x23, #0x10]\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q0, [x9, #0x10]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
"beq 13f\n"
"12:" // Output channel oddments: Kernel loop
- "ldp x23, x9, [x20], #0x10\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "subs x21, x21, #0x1\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
+ "ldr q7, [x21, #0x10]\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x20, #0x0]\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "ldr q3, [x23, #0x0]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x10]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [x9, #0x0]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q4, [%x[weights], #0x10]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 12b\n"
"13:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
@@ -562,65 +579,33 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
"b 16f\n"
"14:" // Output channel oddments: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "ldp x23, x9, [x20], #0x10\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x23, #0x0]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "ldr q7, [x23, #0x10]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "ldr q6, [x9, #0x0]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
- "ldr q5, [x9, #0x10]\n"
- "ldr q9, [%x[weights], #0x0]\n"
- "fmla v16.4s, v4.4s, v3.s[0]\n"
- "fmla v17.4s, v4.4s, v3.s[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v18.4s, v4.4s, v3.s[2]\n"
- "fmla v19.4s, v4.4s, v3.s[3]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v21.4s, v4.4s, v2.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[2]\n"
- "fmla v23.4s, v4.4s, v2.s[3]\n"
- "fmla v24.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v27.4s, v4.4s, v1.s[3]\n"
- "fmla v28.4s, v4.4s, v0.s[0]\n"
- "fmla v29.4s, v4.4s, v0.s[1]\n"
- "fmla v30.4s, v4.4s, v0.s[2]\n"
- "fmla v31.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
@@ -635,216 +620,231 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v0.4s, v4.s[0]\n"
+ "fmla v17.4s, v0.4s, v4.s[1]\n"
+ "fmla v18.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v20.4s, v0.4s, v3.s[0]\n"
+ "fmla v21.4s, v0.4s, v3.s[1]\n"
+ "fmla v22.4s, v0.4s, v3.s[2]\n"
+ "fmla v23.4s, v0.4s, v3.s[3]\n"
+ "fmla v24.4s, v0.4s, v2.s[0]\n"
+ "fmla v25.4s, v0.4s, v2.s[1]\n"
+ "fmla v26.4s, v0.4s, v2.s[2]\n"
+ "fmla v27.4s, v0.4s, v2.s[3]\n"
+ "fmla v28.4s, v0.4s, v1.s[0]\n"
+ "fmla v29.4s, v0.4s, v1.s[1]\n"
+ "fmla v30.4s, v0.4s, v1.s[2]\n"
+ "fmla v31.4s, v0.4s, v1.s[3]\n"
"b 16f\n"
"15:" // Output channel oddments: Single kernel point
- "fmla v16.4s, v9.4s, v8.s[0]\n"
- "fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmla v18.4s, v9.4s, v8.s[2]\n"
- "fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmla v20.4s, v9.4s, v7.s[0]\n"
- "fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmla v22.4s, v9.4s, v7.s[2]\n"
- "fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmla v24.4s, v9.4s, v6.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v26.4s, v9.4s, v6.s[2]\n"
- "fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmla v28.4s, v9.4s, v5.s[0]\n"
- "fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmla v30.4s, v9.4s, v5.s[2]\n"
- "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
"16:" // Output channel oddments: Done
- "fmin v16.4s, v16.4s, v10.4s\n"
- "fmin v17.4s, v17.4s, v10.4s\n"
- "fmin v18.4s, v18.4s, v10.4s\n"
- "fmin v19.4s, v19.4s, v10.4s\n"
- "fmin v20.4s, v20.4s, v10.4s\n"
- "fmin v21.4s, v21.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "fmin v30.4s, v30.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v11.4s\n"
- "fmax v17.4s, v17.4s, v11.4s\n"
- "fmax v18.4s, v18.4s, v11.4s\n"
- "fmax v19.4s, v19.4s, v11.4s\n"
- "fmax v20.4s, v20.4s, v11.4s\n"
- "fmax v21.4s, v21.4s, v11.4s\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "fmax v30.4s, v30.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
"tbz %x[n_output_channels], #1, 17f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.d }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.d }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.d }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.d }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.d }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.d }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.d }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.d }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
"add x10, x10, #0x2\n"
- "st1 { v24.d }[0], [x20]\n"
- "st1 { v25.d }[0], [x21]\n"
- "st1 { v26.d }[0], [x22]\n"
- "st1 { v27.d }[0], [x23]\n"
- "st1 { v28.d }[0], [x24]\n"
- "st1 { v29.d }[0], [x25]\n"
- "st1 { v30.d }[0], [x26]\n"
- "st1 { v31.d }[0], [x27]\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 18f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.s }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.s }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.s }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.s }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.s }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.s }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.s }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.s }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x26]\n"
- "st1 { v31.s }[2], [x27]\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 18f\n"
"17:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x21, x21, x10, LSL #2\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x10, LSL #2\n"
- "add x23, x23, x10, LSL #2\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x10, LSL #2\n"
- "add x25, x25, x10, LSL #2\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x10, LSL #2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v16.s }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x10, LSL #2\n"
- "st1 { v17.s }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x10, LSL #2\n"
- "st1 { v18.s }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x10, LSL #2\n"
- "st1 { v19.s }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x10, LSL #2\n"
- "st1 { v20.s }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x10, LSL #2\n"
- "st1 { v21.s }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x10, LSL #2\n"
- "st1 { v22.s }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v23.s }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x10, LSL #2\n"
- "st1 { v24.s }[0], [x20]\n"
- "st1 { v25.s }[0], [x21]\n"
- "st1 { v26.s }[0], [x22]\n"
- "st1 { v27.s }[0], [x23]\n"
- "st1 { v28.s }[0], [x24]\n"
- "st1 { v29.s }[0], [x25]\n"
- "st1 { v30.s }[0], [x26]\n"
- "st1 { v31.s }[0], [x27]\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"18:" // Output channel oddments: Done: Store: Bit 1: End
-
"19:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
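For readers tracing the register renumbering in the fp32 kernel diff above: the arithmetic is unchanged. Each step fuses the weight vector against one broadcast input lane (the "fmla vN.4s, v10.4s, v3.s[i]" pattern), and the activation clamp is a vector min against the max values followed by a vector max against the min values before the stores. A minimal C++ NEON-intrinsics sketch of that pattern follows; the helper names are hypothetical and the real kernel is the hand-scheduled assembly above, not this code.

#include <arm_neon.h>

// Illustrative sketch only: one weight vector multiplied against four
// broadcast lanes of an input vector, mirroring
// "fmla vN.4s, v10.4s, v3.s[0..3]" in the kernel above.
static inline void accumulate_lane4(float32x4_t acc[4],
                                    float32x4_t weights,
                                    float32x4_t inputs)
{
  acc[0] = vfmaq_laneq_f32(acc[0], weights, inputs, 0);
  acc[1] = vfmaq_laneq_f32(acc[1], weights, inputs, 1);
  acc[2] = vfmaq_laneq_f32(acc[2], weights, inputs, 2);
  acc[3] = vfmaq_laneq_f32(acc[3], weights, inputs, 3);
}

// Mirrors the "fmin ... / fmax ..." tail: clamp to [minval, maxval]
// before the stores to the output pointers.
static inline float32x4_t clamp_activation(float32x4_t v,
                                           float32x4_t minval,
                                           float32x4_t maxval)
{
  return vmaxq_f32(vminq_f32(v, maxval), minval);
}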
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 79bba40ca3..85053b374c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *,
- const int32_t *,
- const arm_gemm::Requantize32&,
- const int32_t *, const int32_t *,
- int8_t *const *const
-);
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
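The s8q generic kernel diffed below follows the usual quantized dot-product shape: "sdot" accumulates int8 products into int32 lanes, an "mls" against the b_offset vector applies the weight zero-point correction, and the tail requantizes with "sqrdmulh" (saturating rounding doubling multiply-high), "srshl" by a negative amount (rounding shift right), an add of c_offset, a clamp to the Requantize32 min/max values, and a "uzp1" narrowing before the store. A scalar C++ sketch of that requantize tail follows, assuming per-channel multiplier and shift operands as loaded from the params block; the function and parameter names are hypothetical, and sqrdmulh saturation is omitted for brevity.

#include <algorithm>
#include <cstdint>

// Scalar sketch of the requantize tail visible in the assembly below
// (sqrdmulh -> srshl -> add c_offset -> clamp -> narrow). Illustrative only.
static inline int8_t requantize_one(int32_t acc, int32_t mult, int right_shift,
                                    int32_t c_offset, int32_t minval,
                                    int32_t maxval)
{
  // sqrdmulh: (2*acc*mult + 2^31) >> 32, which equals
  // (acc*mult + 2^30) >> 31; the INT32_MIN*INT32_MIN saturation
  // corner is omitted in this sketch.
  int64_t prod = (int64_t)acc * (int64_t)mult;
  int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);
  // srshl by a negative shift operand is a rounding arithmetic shift right.
  int32_t rounded = right_shift > 0
      ? (int32_t)(((int64_t)high + (1LL << (right_shift - 1))) >> right_shift)
      : high;
  // Add the output zero point, clamp to the Requantize32 bounds, narrow.
  int32_t out = rounded + c_offset;
  return (int8_t)std::min(std::max(out, minval), maxval);
}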
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index fda88f94bb..916c8a4afe 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,15 +30,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x20, #0x1\n"
@@ -47,817 +39,817 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"orr x20, x20, #0x10000\n"
"lsr x11, %x[n_channels], #0x4\n"
- "dup v14.4s, w20\n"
+ "dup v12.4s, w20\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v13.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"mov x28, #0x0\n"
"mov x27, #0x0\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x11, 3f\n"
- "ldr q9, [x15, x28]\n"
- "ldr q8, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q7, [x13, x28]\n"
- "ldr q6, [x12, x28]\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q4, [x10, x28]\n"
- "ldr q3, [x9, x28]\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "ldr q2, [x26, x28]\n"
- "ldr q1, [x25, x28]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q15, [x15, x28]\n"
"ldr q28, [x14, x28]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "ldr q23, [x9, x28]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
"add x28, x28, #0x10\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"subs x11, x11, #0x1\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- "ldr q3, [x9, x28]\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "ldr q8, [x14, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- "ldr q2, [x26, x28]\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "ldr q7, [x13, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
"smax v25.4s, v25.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- "ldr q1, [x25, x28]\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "ldr q6, [x12, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [x15, x28]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x170]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x150]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [x10, x28]\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "ldr q23, [x9, x28]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "str s20, [x21, x27]\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
"add x27, x27, #0x4\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x140]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"add x28, x28, #0x10\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
"ldr q4, [%x[params], #0x10]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
"ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str s5, [x24, x27]\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
"add x27, x27, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -869,794 +861,794 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d8, [x14], #0x8\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
"ldr d7, [x13], #0x8\n"
- "ldr d6, [x12], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d1, [x25], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
"ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v6.s }[2], [x12], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.h }[6], [x15], #0x2\n"
- "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
"ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v6.h }[6], [x12], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[14], [x15], #0x1\n"
- "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
"ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v6.b }[14], [x12], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[12], [x15], #0x1\n"
- "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
"ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v6.b }[12], [x12], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.h }[4], [x15], #0x2\n"
- "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
"ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v6.h }[4], [x12], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[10], [x15], #0x1\n"
- "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
"ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v6.b }[10], [x12], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[8], [x15], #0x1\n"
- "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
"ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v6.b }[8], [x12], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s9, [x15], #0x4\n"
- "ldr s8, [x14], #0x4\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
"ldr s7, [x13], #0x4\n"
- "ldr s6, [x12], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v9.h }[2], [x15], #0x2\n"
- "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
"ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v6.h }[2], [x12], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[6], [x15], #0x1\n"
- "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
"ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v6.b }[6], [x12], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[4], [x15], #0x1\n"
- "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
"ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v6.b }[4], [x12], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h9, [x15], #0x2\n"
- "ldr h8, [x14], #0x2\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
"ldr h7, [x13], #0x2\n"
- "ldr h6, [x12], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[2], [x15], #0x1\n"
- "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
"ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v6.b }[2], [x12], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b9, [x15], #0x1\n"
- "ldr b8, [x14], #0x1\n"
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
"ldr b7, [x13], #0x1\n"
- "ldr b6, [x12], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b1, [x25], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
"ldp x13, x12, [%x[inptrs], #0x50]\n"
"add x15, x15, x28\n"
"add x14, x14, x28\n"
"ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x13, x13, x28\n"
"add x12, x12, x28\n"
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d29, [x15], #0x8\n"
- "ldr d28, [x14], #0x8\n"
- "ldr d27, [x13], #0x8\n"
- "ldr d26, [x12], #0x8\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v29.s }[2], [x15], #0x4\n"
- "ld1 { v28.s }[2], [x14], #0x4\n"
- "ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v26.s }[2], [x12], #0x4\n"
- "ld1 { v24.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v29.h }[6], [x15], #0x2\n"
- "ld1 { v28.h }[6], [x14], #0x2\n"
- "ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v26.h }[6], [x12], #0x2\n"
- "ld1 { v24.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v22.h }[6], [x26], #0x2\n"
- "ld1 { v21.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[14], [x15], #0x1\n"
- "ld1 { v28.b }[14], [x14], #0x1\n"
- "ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v26.b }[14], [x12], #0x1\n"
- "ld1 { v24.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v22.b }[14], [x26], #0x1\n"
- "ld1 { v21.b }[14], [x25], #0x1\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[12], [x15], #0x1\n"
- "ld1 { v28.b }[12], [x14], #0x1\n"
- "ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v26.b }[12], [x12], #0x1\n"
- "ld1 { v24.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v22.b }[12], [x26], #0x1\n"
- "ld1 { v21.b }[12], [x25], #0x1\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v29.h }[4], [x15], #0x2\n"
- "ld1 { v28.h }[4], [x14], #0x2\n"
- "ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v26.h }[4], [x12], #0x2\n"
- "ld1 { v24.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v22.h }[4], [x26], #0x2\n"
- "ld1 { v21.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[10], [x15], #0x1\n"
- "ld1 { v28.b }[10], [x14], #0x1\n"
- "ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v26.b }[10], [x12], #0x1\n"
- "ld1 { v24.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v22.b }[10], [x26], #0x1\n"
- "ld1 { v21.b }[10], [x25], #0x1\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[8], [x15], #0x1\n"
- "ld1 { v28.b }[8], [x14], #0x1\n"
- "ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v26.b }[8], [x12], #0x1\n"
- "ld1 { v24.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v22.b }[8], [x26], #0x1\n"
- "ld1 { v21.b }[8], [x25], #0x1\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s29, [x15], #0x4\n"
- "ldr s28, [x14], #0x4\n"
- "ldr s27, [x13], #0x4\n"
- "ldr s26, [x12], #0x4\n"
- "ldr s24, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v29.h }[2], [x15], #0x2\n"
- "ld1 { v28.h }[2], [x14], #0x2\n"
- "ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v24.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v22.h }[2], [x26], #0x2\n"
- "ld1 { v21.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[6], [x15], #0x1\n"
- "ld1 { v28.b }[6], [x14], #0x1\n"
- "ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v26.b }[6], [x12], #0x1\n"
- "ld1 { v24.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v22.b }[6], [x26], #0x1\n"
- "ld1 { v21.b }[6], [x25], #0x1\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[4], [x15], #0x1\n"
- "ld1 { v28.b }[4], [x14], #0x1\n"
- "ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v26.b }[4], [x12], #0x1\n"
- "ld1 { v24.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v22.b }[4], [x26], #0x1\n"
- "ld1 { v21.b }[4], [x25], #0x1\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h29, [x15], #0x2\n"
- "ldr h28, [x14], #0x2\n"
- "ldr h27, [x13], #0x2\n"
- "ldr h26, [x12], #0x2\n"
- "ldr h24, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "ldr h21, [x25], #0x2\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[2], [x15], #0x1\n"
- "ld1 { v28.b }[2], [x14], #0x1\n"
- "ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v26.b }[2], [x12], #0x1\n"
- "ld1 { v24.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v22.b }[2], [x26], #0x1\n"
- "ld1 { v21.b }[2], [x25], #0x1\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b29, [x15], #0x1\n"
- "ldr b28, [x14], #0x1\n"
- "ldr b27, [x13], #0x1\n"
- "ldr b26, [x12], #0x1\n"
- "ldr b24, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "ldr b21, [x25], #0x1\n"
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
"cmp x20, #0x4\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
- ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x50]\n"
- ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
+ ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
"add %x[params], %x[params], #0x60\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 20f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 21f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
- ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
+ ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 24f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 25f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
- ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
- "movi v17.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 28f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 29f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
- ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
+ ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 33f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
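(Editor's note, for readers tracing the hunks above: the requantize tail repeated in each unroll — sqrdmulh by the per-layer multiplier, the and/sshr/sqadd sign fixup, srshl by the negative shift lanes, add of the c_offset vector, then smax/smin clamping and uzp1 narrowing before the stores — is the standard gemmlowp-style fixed-point requantization, applied after the mls step has subtracted the weight-zero-point row sums. The scalar sketch below is illustrative only: the helper names are not part of the library, it assumes the shift is given as a non-negative right-shift count rather than the negative lane values srshl consumes, and it uses int8_t to match the s8q kernels; the u8q variants are analogous.)

#include <algorithm>
#include <cstdint>
#include <limits>

// SQRDMULH on one 32-bit lane: saturating rounding doubling multiply-high.
static inline int32_t sqrdmulh32(int32_t a, int32_t b)
{
    if (a == std::numeric_limits<int32_t>::min() &&
        b == std::numeric_limits<int32_t>::min())
        return std::numeric_limits<int32_t>::max(); // the one saturating case
    const int64_t p = int64_t(a) * int64_t(b);
    return int32_t((p + (int64_t(1) << 30)) >> 31);
}

// The and/sshr/sqadd fixup followed by srshl behaves as a rounding
// arithmetic right shift with ties rounded away from zero.
static inline int32_t rounding_shift_right(int32_t x, int32_t s)
{
    if (s <= 0) return x;                       // per-layer shift may be zero
    const int64_t fixup = (x < 0) ? -1 : 0;     // nudge negative inputs
    return int32_t((int64_t(x) + fixup + (int64_t(1) << (s - 1))) >> s);
}

// One output lane: multiply, shift, add c_offset, clamp, narrow to 8 bits.
static inline int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                     int32_t c_offset, int32_t minval,
                                     int32_t maxval)
{
    int32_t v = rounding_shift_right(sqrdmulh32(acc, mul), shift);
    v += c_offset;               // the "add ..., v14.4s" step above
    v = std::max(v, minval);     // "smax ..., v13.4s"
    v = std::min(v, maxval);     // "smin ..., v11.4s"
    return int8_t(v);            // "uzp1" narrowing before the str/st1 stores
}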
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 411b4788d8..a679b02f7c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
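(A side note on the flattened declaration above: only the top-level const qualifiers were dropped from the parameters, and in C++ top-level const on a parameter is not part of the function's type, so the one-line prototype still matches the kernel's definition. A minimal illustration, with a hypothetical function name:)

// Both lines declare the same function: the parameter's own const-ness is
// ignored in the function type, while const on the pointee type is kept.
void f(const int *const p);  // as in the old multi-line prototype
void f(const int *p);        // as in the new single-line prototype: same f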
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index d69d0e1ef2..a181603f1e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ssubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
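The renamed epilogue above still implements the standard Requantize32 output stage: SQRDMULH applies the fixed-point per-channel multiplier, SRSHL with a negative per-channel shift performs the rounding right shift, SQXTN/SQXTN2 narrow to 16 bits, SQADD adds the c_offset, SMAX/SMIN clamp to [minval, maxval], and UZP1 packs the final bytes for the store. A minimal scalar sketch of that per-lane math follows, assuming per-channel multiplier/shift pairs as loaded from requant_muls and requant_shifts; the helper names are illustrative, not part of the library:

    #include <cstdint>
    #include <algorithm>

    // Saturating rounding doubling multiply-high, one lane of SQRDMULH.
    static int32_t sqrdmulh_lane(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // only saturating input pair
        int64_t prod = (int64_t)a * (int64_t)b;
        return (int32_t)((prod + (1ll << 30)) >> 31);           // high half of 2*a*b, rounded
    }

    // Signed rounding shift, one lane of SRSHL; negative shifts round right,
    // which is the case the requant shift vectors exercise here.
    static int32_t srshl_lane(int32_t v, int32_t shift)
    {
        if (shift >= 0) return v << shift;
        int32_t s = -shift;
        return (int32_t)(((int64_t)v + (1ll << (s - 1))) >> s);
    }

    // One output lane: requantize the int32 accumulator and clamp to the
    // quantized activation range before the final int8 store.
    static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = srshl_lane(sqrdmulh_lane(acc, mul), shift) + c_offset;
        return (int8_t)std::min(std::max(v, minval), maxval);
    }

The vector code performs exactly this sequence four (then eight, after narrowing) lanes at a time; the register renames in the patch do not alter it.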
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 852466c48d..7370f89699 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
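The generic.cpp hunks that follow likewise rename registers without changing the arithmetic of the accumulation core: SSUBL widens each int8 activation or weight to int16 while subtracting its zero point (a_offset for activations, b_offset for weights, each broadcast as a byte by the LD1R loads), and SMLAL/SMLAL2 accumulate the widened products into int32 lanes that start from the loaded bias. A scalar sketch of one output channel, treating the offsets as byte values as the kernel does; function names are illustrative:

    #include <cstdint>

    // One lane of SSUBL: widen to int16 while removing the zero point.
    static inline int16_t widen_sub(int8_t v, int8_t offset)
    {
        return (int16_t)((int16_t)v - (int16_t)offset);
    }

    // One channel of the 3x3 depthwise tap sum, bias-initialised as in the
    // asm, where accumulators are copies of the loaded bias vectors.
    static int32_t depthwise_3x3_lane(const int8_t in[9], const int8_t w[9],
                                      int8_t a_offset, int8_t b_offset, int32_t bias)
    {
        int32_t acc = bias;
        for (int i = 0; i < 9; i++)
        {
            // SMLAL/SMLAL2: int16 x int16 products accumulated into int32.
            acc += (int32_t)widen_sub(in[i], a_offset) * (int32_t)widen_sub(w[i], b_offset);
        }
        return acc;
    }

The kernel runs this computation eight channels per iteration (x8 = n_channels >> 3, with the "Oddments" path handling the remaining 1-7 channels via the TBZ bit tests on the channel count).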
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index fa9ae97dee..6432417c35 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
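(For readers tracing the requantization tail in the hunks above — the runs of sqrdmulh, and/sshr/sqadd, srshl, sqxtn, sqadd, smax/smin, uzp1 — the following is a rough scalar model of what each 32-bit accumulator lane computes. It is an illustrative sketch, not library code: the function and parameter names are invented here, the per-lane right shift is given as a non-negative magnitude even though the kernels store it negated for SRSHL, and the SQXTN 32-to-16-bit saturating narrow is folded into the final clamp.)

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of SQRDMULH: saturating rounding doubling
    // multiply returning the high 32 bits of the 64-bit product.
    static int32_t sqrdmulh_scalar(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturated case
        int64_t p = (int64_t) a * (int64_t) b;
        return (int32_t) ((2 * p + (1LL << 31)) >> 32);
    }

    // One lane of the requantization tail (illustrative, simplified).
    static int8_t requantize_lane(int32_t acc, int32_t mul, int right_shift,
                                  int16_t c_offset, int16_t minval, int16_t maxval)
    {
        int32_t v = sqrdmulh_scalar(acc, mul);          // SQRDMULH with requant_muls
        if (right_shift > 0)
        {
            if (v < 0) v--;                             // AND / SSHR #31 / SQADD nudge
            v = (int32_t) (((int64_t) v + (1LL << (right_shift - 1)))
                           >> right_shift);             // SRSHL (rounding right shift)
        }
        v += c_offset;                                  // SQADD with the c_offset splat
        v = std::clamp(v, (int32_t) minval,
                          (int32_t) maxval);            // SMAX / SMIN activation clamp
        return (int8_t) v;                              // UZP1 + byte store
    }

(The fixed-point multiply-plus-shift form is what lets a single multiplier/shift pair per output replace a floating-point rescale inside the inner loop; the nudge on negative lanes keeps the rounding of the shift consistent with the reference requantization.)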
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index e60597d390..65ebe627ef 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- int8_t *const *const);
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 4b0ad00187..1dc0f33186 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "ssubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "ssubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
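
(Editor's note on the kernel above: the tail of the assembly — the `sqrdmulh` / `and`+`sshr`+`sqadd` / `srshl` / `sqadd` / `smax` / `smin` / `sqxtn` run before the `st1` stores — is the standard fixed-point requantisation of the int32 accumulators down to int8. The scalar sketch below is illustrative only: the parameter names mirror the `Requantize32` fields bound as asm operands at the bottom of the block, but this is not an actual helper in the library, and the rounding fixup is reproduced to the best of this sketch's fidelity.)

```cpp
// Minimal scalar sketch (assumption: gemmlowp-style rounding) of the
// requantisation epilogue the vector code above performs per lane.
#include <algorithm>
#include <cstdint>

static inline int8_t requantize_s32(int32_t acc, int32_t multiplier,
                                    int32_t right_shift, int32_t c_offset,
                                    int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling multiply, keep the high half.
    int64_t prod = static_cast<int64_t>(acc) * multiplier;
    int64_t high = (prod + (INT64_C(1) << 30)) >> 31;
    high = std::min<int64_t>(std::max<int64_t>(high, INT32_MIN), INT32_MAX);

    // srshl by a negative amount plus the and/sshr/sqadd fixup: a rounding
    // right shift with ties rounded away from zero.
    int32_t x = static_cast<int32_t>(high);
    if (right_shift > 0) {
        const int32_t mask = (1 << right_shift) - 1;
        const int32_t rem = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        x = (x >> right_shift) + (rem > threshold ? 1 : 0);
    }

    // sqadd the output offset, clamp to [minval, maxval], then narrow to
    // int8 (smax / smin / sqxtn / uzp1 in the vector code).
    x += c_offset;
    x = std::min(std::max(x, minval), maxval);
    return static_cast<int8_t>(x);
}
```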
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
index 9b1f7c239f..9c92a9dd46 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKern
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
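
(Editor's note on the guard-directive changes in this and the following hunks: the patch consistently hoists the common includes outside the `__aarch64__` guard so every build parses them, and leaves only the kernel body conditional. A minimal sketch of the resulting layout — an illustrative skeleton, not a verbatim copy of any one file:)

```cpp
// Post-patch guard layout: includes unconditional, kernel guarded.
#include "arm_gemm.hpp"

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {

// ... kernel implementation lives here ...

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)
```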
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 3f345cf95a..77b7d231e0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ssubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ssubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
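
For readers tracing the register renumbering in the hunks above: the arithmetic is unchanged. Each 32-bit accumulator is requantized by an optional left shift (sshl), a rounding-doubling high multiply by the per-layer multiplier (sqrdmulh), a sign correction (the and / sshr #0x1f / sqadd triplets, which adjust negative values before the rounding shift), a rounding right shift (srshl by a negative shift amount), addition of the output offset, and a clamp. A rough scalar sketch of that pipeline follows; all names are hypothetical, saturation and the negative-value fixup are omitted for brevity, and the real kernel operates on 4-lane vectors.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the sshl / sqrdmulh / srshl requantization sequence.
    // Illustrative only; not part of the patch.
    inline int8_t requantize(int32_t acc, int left_shift, int32_t mul,
                             int right_shift, int32_t c_offset,
                             int32_t minval, int32_t maxval)
    {
        int64_t v = static_cast<int64_t>(acc) << left_shift;           // sshl
        v = (v * mul + (INT64_C(1) << 30)) >> 31;                      // sqrdmulh, round to nearest
        const int32_t round = right_shift > 0 ? (1 << (right_shift - 1)) : 0;
        int32_t r = static_cast<int32_t>((v + round) >> right_shift);  // srshl by -right_shift
        r += c_offset;                                                 // add the output offset
        return static_cast<int8_t>(std::clamp(r, minval, maxval));     // smin/smax, then narrow
    }
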
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 5ca3ccd4bf..14adf8880f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -58,4 +58,4 @@ struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 342a297dd4..be8fbfa0e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -40,169 +41,169 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q14, [%x[params], #0x0]\n"
+ "ldr q11, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v15.16b, #0x1\n"
- "ushr v15.4s, v15.4s, #0x8\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ld1 { v1.16b }, [x20]\n"
- "mov v29.16b, v1.16b\n"
- "mov v16.16b, v1.16b\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1 { v2.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v22.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x20]\n"
"ld1 { v4.16b }, [x20]\n"
- "mov v31.16b, v2.16b\n"
- "mov v30.16b, v2.16b\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1 { v0.16b }, [x20]\n"
- "mov v23.16b, v4.16b\n"
- "mov v21.16b, v4.16b\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1 { v3.16b }, [x20]\n"
- "mov v20.16b, v4.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x2\n"
- "ext v16.16b, v16.16b, v16.16b, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x2\n"
- "ext v31.16b, v31.16b, v31.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"ld1r { v12.4s }, [x20]\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
- "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v10.4s }, [x20]\n"
- "mov v25.16b, v0.16b\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
"mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
"mov v18.16b, v0.16b\n"
- "mov v24.16b, v3.16b\n"
+ "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"mov v17.16b, v3.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
"ext v19.16b, v19.16b, v19.16b, #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v1.4s, v1.4s, v16.4s\n"
- "mov v16.16b, v3.16b\n"
- "zip1 v29.4s, v29.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v31.4s\n"
- "zip1 v22.4s, v22.4s, v30.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x2\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v21.4s\n"
- "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
"zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v25.4s, v25.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v29.4s\n"
- "zip1 v2.4s, v2.4s, v22.4s\n"
- ".inst 0x4f81e1fa // sdot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
"zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v24.4s, v24.4s, v16.4s\n"
- ".inst 0x4fa1e1fb // sdot v27.4s, v15.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v23.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
"movi v23.4s, #0x0\n"
- ".inst 0x4f81e9f7 // sdot v23.4s, v15.16b, v1.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
"movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
- ".inst 0x4fa1e9f6 // sdot v22.4s, v15.16b, v1.4b[3]\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x4f82e1f5 // sdot v21.4s, v15.16b, v2.4b[0]\n"
- "movi v8.4s, #0x0\n"
+ ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4fa2e1f4 // sdot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
"movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
- ".inst 0x4f82e9e9 // sdot v9.4s, v15.16b, v2.4b[2]\n"
+ ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v25.4s\n"
- ".inst 0x4fa2e9e8 // sdot v8.4s, v15.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v24.4s\n"
- ".inst 0x4f84e1f3 // sdot v19.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x4fa4e1f2 // sdot v18.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4f84e9f1 // sdot v17.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x4fa4e9f0 // sdot v16.4s, v15.16b, v4.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
"movi v30.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f80e1ff // sdot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
"movi v28.4s, #0x0\n"
- ".inst 0x4fa0e1fe // sdot v30.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4f80e9fd // sdot v29.4s, v15.16b, v0.4b[2]\n"
- ".inst 0x4fa0e9fc // sdot v28.4s, v15.16b, v0.4b[3]\n"
- "add v24.4s, v26.4s, v21.4s\n"
- "add v25.4s, v27.4s, v20.4s\n"
- "add v26.4s, v23.4s, v9.4s\n"
- "add v27.4s, v22.4s, v8.4s\n"
- "add v23.4s, v19.4s, v21.4s\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x4f83e1f6 // sdot v22.4s, v15.16b, v3.4b[0]\n"
- "add v21.4s, v18.4s, v20.4s\n"
+ ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
"movi v20.4s, #0x0\n"
- ".inst 0x4fa3e1f4 // sdot v20.4s, v15.16b, v3.4b[1]\n"
- "add v19.4s, v17.4s, v9.4s\n"
+ ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f83e9f2 // sdot v18.4s, v15.16b, v3.4b[2]\n"
- "add v17.4s, v16.4s, v8.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4fa3e9f0 // sdot v16.4s, v15.16b, v3.4b[3]\n"
+ ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
"add v24.4s, v24.4s, v31.4s\n"
"add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v28.4s\n"
- "add v28.4s, v23.4s, v22.4s\n"
- "add v29.4s, v21.4s, v20.4s\n"
- "add v30.4s, v19.4s, v18.4s\n"
- "add v31.4s, v17.4s, v16.4s\n"
- "neg v13.4s, v13.4s\n"
- "mul v24.4s, v24.4s, v13.4s\n"
- "mul v25.4s, v25.4s, v13.4s\n"
- "mul v26.4s, v26.4s, v13.4s\n"
- "mul v27.4s, v27.4s, v13.4s\n"
- "mul v28.4s, v28.4s, v13.4s\n"
- "mul v29.4s, v29.4s, v13.4s\n"
- "mul v30.4s, v30.4s, v13.4s\n"
- "mul v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
- "ldr q14, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x20]\n"
".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -219,43 +220,43 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr q5, [%x[params], #0x30]\n"
".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
+ "and v19.16b, v24.16b, v21.16b\n"
".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
- "and v18.16b, v25.16b, v20.16b\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "and v16.16b, v27.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
@@ -264,38 +265,38 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -329,14 +330,14 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x0]\n"
@@ -415,30 +416,30 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -514,4 +515,5 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
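
For context on the constant renamed from v15 to v8 in the kernel above: movi #0x1 followed by ushr #0x8 leaves 0x00010101 in every 32-bit lane, so each indexed sdot against it sums three of the four bytes of the selected input word, and the later neg/mul/add sequence folds -b_offset * row_sum into the bias, the usual correction for an asymmetric weight zero point. A minimal scalar sketch of that correction, with hypothetical names:

    #include <cstdint>

    // Bias correction modelled on the neg/mul/add sequence above:
    // bias' = bias - b_offset * sum(input bytes feeding this output).
    // taps = 3 here, matching the three 1-bytes left in each sdot lane.
    inline int32_t corrected_bias(int32_t bias, int32_t b_offset,
                                  const int8_t *in, int taps)
    {
        int32_t row_sum = 0;
        for (int i = 0; i < taps; ++i)
        {
            row_sum += in[i];
        }
        return bias - b_offset * row_sum;
    }
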
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 0641229aa7..62b033f48d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
-#if defined(__aarch64__)
-
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -57,4 +57,5 @@ struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 9fa38c6efe..17afc92e30 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,133 +43,133 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
__asm__ __volatile__(
"ldr q12, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v28.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v31.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v30.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "mov v16.16b, v3.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v4.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
- "mov v15.16b, v4.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
- "mov v20.16b, v2.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
"ld1 { v1.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v16.2d\n"
- "zip1 v4.2d, v4.2d, v15.2d\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
"ld1 { v5.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x30]\n"
"mov v26.16b, v1.16b\n"
- "mov v13.16b, v5.16b\n"
+ "mov v22.16b, v5.16b\n"
"ld1 { v6.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x38]\n"
"mov v19.16b, v6.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v7.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x0]\n"
- "mov v17.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v20.2d\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
"ld1 { v0.16b }, [x20]\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
"ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4f83e392 // sdot v18.4s, v28.16b, v3.4b[0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4f83eb9f // sdot v31.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x4f84e398 // sdot v24.4s, v28.16b, v4.4b[0]\n"
+ ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v23.4s }, [x20]\n"
- ".inst 0x4f84eb9e // sdot v30.4s, v28.16b, v4.4b[2]\n"
- "mov v16.16b, v0.16b\n"
- ".inst 0x4f82e395 // sdot v21.4s, v28.16b, v2.4b[0]\n"
- "movi v20.4s, #0x0\n"
- "movi v29.4s, #0x1\n"
- ".inst 0x4f82eb94 // sdot v20.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
"zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x4fa3e3b2 // sdot v18.4s, v29.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v13.2d\n"
+ ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
"zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x4fa3ebbf // sdot v31.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v17.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
"movi v22.4s, #0x0\n"
- ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x4fa4ebbe // sdot v30.4s, v29.16b, v4.4b[3]\n"
- ".inst 0x4f81e396 // sdot v22.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
"ld1r { v15.4s }, [x20]\n"
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x4f81eb9a // sdot v26.4s, v28.16b, v1.4b[2]\n"
- "zip1 v0.2d, v0.2d, v16.2d\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4f85e399 // sdot v25.4s, v28.16b, v5.4b[0]\n"
+ ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4f85eb9b // sdot v27.4s, v28.16b, v5.4b[2]\n"
- ".inst 0x4f86e393 // sdot v19.4s, v28.16b, v6.4b[0]\n"
- "add v24.4s, v18.4s, v24.4s\n"
- "mov x9, #0x0\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f86eb92 // sdot v18.4s, v28.16b, v6.4b[2]\n"
- ".inst 0x4fa2e3b5 // sdot v21.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- ".inst 0x4fa2ebb4 // sdot v20.4s, v29.16b, v2.4b[3]\n"
- "add v17.4s, v31.4s, v30.4s\n"
- ".inst 0x4fa1e3b6 // sdot v22.4s, v29.16b, v1.4b[1]\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f87e390 // sdot v16.4s, v28.16b, v7.4b[0]\n"
- ".inst 0x4fa1ebba // sdot v26.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- ".inst 0x4fa5e3b9 // sdot v25.4s, v29.16b, v5.4b[1]\n"
- ".inst 0x4fa5ebbb // sdot v27.4s, v29.16b, v5.4b[3]\n"
- "add v30.4s, v21.4s, v24.4s\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x4fa6e3b3 // sdot v19.4s, v29.16b, v6.4b[1]\n"
- ".inst 0x4fa6ebb2 // sdot v18.4s, v29.16b, v6.4b[3]\n"
- "add v31.4s, v20.4s, v17.4s\n"
+ ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x4fa7e3b0 // sdot v16.4s, v29.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v30.4s\n"
+ ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- "add v21.4s, v26.4s, v31.4s\n"
- "add v20.4s, v25.4s, v19.4s\n"
- "add v19.4s, v27.4s, v18.4s\n"
- "add v18.4s, v16.4s, v24.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f87eb90 // sdot v16.4s, v28.16b, v7.4b[2]\n"
- ".inst 0x4fa7ebb0 // sdot v16.4s, v29.16b, v7.4b[3]\n"
- "add v17.4s, v16.4s, v17.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x4fa0e3b0 // sdot v16.4s, v29.16b, v0.4b[1]\n"
- "add v24.4s, v22.4s, v16.4s\n"
- "add v26.4s, v22.4s, v25.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x4fa0ebb0 // sdot v16.4s, v29.16b, v0.4b[3]\n"
- "add v25.4s, v21.4s, v16.4s\n"
- "add v27.4s, v21.4s, v27.4s\n"
- "add v28.4s, v20.4s, v30.4s\n"
- "add v29.4s, v19.4s, v31.4s\n"
- "add v30.4s, v18.4s, v20.4s\n"
- "add v31.4s, v17.4s, v19.4s\n"
+ ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
"neg v23.4s, v23.4s\n"
"mul v24.4s, v24.4s, v23.4s\n"
"mul v25.4s, v25.4s, v23.4s\n"
@@ -194,11 +195,11 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x60]\n"
- "ldr q20, [%x[params], #0x70]\n"
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
- "ldr q12, [%x[params], #0x80]\n"
+ "ldr q20, [%x[params], #0x80]\n"
".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -212,7 +213,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
@@ -221,7 +222,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
@@ -230,115 +231,115 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
- ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "and v18.16b, v25.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v19.4s\n"
- "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v19.4s\n"
- "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -347,14 +348,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -388,14 +389,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x60]\n"
@@ -420,7 +421,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add x21, x21, x28\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
@@ -430,7 +431,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
@@ -439,68 +440,68 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
"sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
"and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
@@ -536,14 +537,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -552,14 +553,14 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -635,4 +636,5 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
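(Annotation on the kernel above.) The `.inst 0x4f83e11e  // sdot ...` words are raw encodings of SDOT-by-element, presumably emitted so the file still assembles on toolchains without dot-product support; the trailing comments give the intended mnemonics. The store-back tail (sqrdmulh, the and/sshr/sqadd rounding fix-up, srshl, add, smin, smax, uzp1) is the usual fixed-point requantization of the 32-bit accumulators down to int8. Below is a minimal scalar sketch of one lane of that tail, assuming Requantize32-style per-layer parameters (the optional per-layer left shift appears in the next kernel rather than in the hunks above); all names are illustrative, not the library's.

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar model of one lane of the vector tail; the real
    // kernels apply each step to four lanes per instruction.
    static int32_t sqrdmulh(int32_t a, int32_t b)
    {
        // Saturating rounding doubling multiply, returning the high 32 bits.
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        const int64_t p = static_cast<int64_t>(a) * b;
        return static_cast<int32_t>((2 * p + (INT64_C(1) << 31)) >> 32);
    }

    static int32_t rounding_shift_right(int32_t x, int shift)
    {
        // Combined effect of the and/sshr/sqadd fix-up plus srshl: divide by
        // 2^shift, rounding to nearest with ties away from zero.
        if (shift <= 0) return x;  // no right shift requested
        const int32_t mask      = (1 << shift) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> shift) + (remainder > threshold ? 1 : 0);
    }

    static int8_t requantize_lane(int32_t acc, int left_shift, int32_t multiplier,
                                  int right_shift, int32_t c_offset,
                                  int32_t minval, int32_t maxval)
    {
        int32_t v = static_cast<int32_t>(static_cast<int64_t>(acc) << left_shift); // sshl
        v = sqrdmulh(v, multiplier);               // sqrdmulh vN, vN, v21
        v = rounding_shift_right(v, right_shift);  // and/sshr/sqadd + srshl
        v += c_offset;                             // add vN, vN, v13
        v = std::min(v, maxval);                   // smin vN, vN, v15
        v = std::max(v, minval);                   // smax vN, vN, v14
        return static_cast<int8_t>(v);             // uzp1 narrows back to bytes
    }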
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 3dad8d5604..3f71c5fb64 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
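(Annotation on the kernel diffed next.) The generic MLA kernel builds its accumulators differently from the dot-product kernel above: ssubl widens the int8 inputs and weights to 16 bits while subtracting the Requantize32 a_offset/b_offset zero points, and smlal then multiply-accumulates one input lane against a four-channel weight vector into the 32-bit accumulators v16–v31 (v16–v23 cover one output row, v24–v31 the other). A scalar sketch of one such step follows; the names are illustrative, not taken from the library.

    #include <cstdint>

    // One ssubl + smlal step of the generic kernel, modelled per output row:
    // eight output positions, four channels per 128-bit accumulator register.
    void mla_step(int32_t acc[8][4], const int8_t weights[4], int8_t b_offset,
                  const int8_t inputs[8], int8_t a_offset)
    {
        for (int p = 0; p < 8; p++)          // one accumulator register per position
        {
            for (int c = 0; c < 4; c++)      // four channels per register
            {
                acc[p][c] += (static_cast<int16_t>(weights[c]) - b_offset)   // ssubl on weights
                           * (static_cast<int16_t>(inputs[p])  - a_offset);  // ssubl on inputs
                // corresponds to: smlal v16+p.4s, v5.4h, v0.h[p]
            }
        }
    }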
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 3a544e0697..b21ad484e5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "ssubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "ssubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
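
[Editor's note: the register renumbering above leaves the kernel's requantisation logic unchanged. The SSHL / SQRDMULH / AND+SSHR+SQADD / SRSHL / ADD / SMIN / SMAX sequence is the usual gemmlowp-style fixed-point requantisation applied lane by lane. A minimal single-lane C sketch of the per-layer case, for orientation only — helper names are illustrative and not taken from the library:

#include <stdint.h>

/* SQRDMULH on one 32-bit lane: saturating rounding doubling multiply-high. */
static inline int32_t sqrdmulh_s32(int32_t a, int32_t b)
{
    int64_t p = 2 * (int64_t)a * (int64_t)b + (1LL << 31);
    int64_t r = p >> 32;
    return r > INT32_MAX ? INT32_MAX : (int32_t)r; /* overflows only for a == b == INT32_MIN */
}

/* One lane of the SSHL/SQRDMULH/AND+SSHR+SQADD/SRSHL/ADD/SMIN/SMAX chain. */
static inline int8_t requantize_lane(int32_t acc, int32_t mul, int left_shift,
                                     int right_shift, int32_t c_offset,
                                     int32_t minval, int32_t maxval)
{
    acc = (int32_t)((uint32_t)acc << left_shift);  /* SSHL by the per-layer left shift  */
    acc = sqrdmulh_s32(acc, mul);                  /* SQRDMULH by the fixed-point mul   */
    if (right_shift > 0 && acc < 0)
        acc -= 1;                                  /* AND/SSHR/SQADD nudge: negative
                                                      ties round away from zero         */
    if (right_shift > 0)                           /* SRSHL by a negative amount =      */
        acc = (int32_t)(((int64_t)acc              /* rounding arithmetic shift right   */
                         + (1LL << (right_shift - 1))) >> right_shift);
    acc += c_offset;                               /* ADD the output offset             */
    if (acc > maxval) acc = maxval;                /* SMIN against maxval               */
    if (acc < minval) acc = minval;                /* SMAX against minval               */
    return (int8_t)acc;                            /* UZP1 then narrows lanes to bytes  */
}

The per-channel paths in the hunks above follow the same pattern, only with the shift and multiplier loaded per output channel rather than broadcast.]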
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 1d45804714..fc83aaf5d2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,14 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int,
- const int8_t *const *const,
- const int8_t *,
- const int32_t *,
- const arm_gemm::Requantize32&,
- const int32_t *, const int32_t *,
- int8_t *const *const);
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
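
[Editor's note: the ".inst 0x4e8..." words in the generic.cpp hunks below are raw SDOT encodings; each line carries its own disassembly as a trailing comment (e.g. "sdot v2.4s, v29.16b, v6.16b"). In ACLE terms a single such instruction accumulates four 4-way int8 dot products into the int32 lanes of the destination. A hedged sketch using the standard intrinsic, compiled with something like -march=armv8.2-a+dotprod:

#include <arm_neon.h>

/* acc.4s[i] += dot(a.16b[4i..4i+3], b.16b[4i..4i+3]) — what each
   "sdot vD.4s, vN.16b, vM.16b" in the kernel below computes. */
static inline int32x4_t dot_accumulate(int32x4_t acc, int8x16_t a, int8x16_t b)
{
    return vdotq_s32(acc, a, b);
}

The surrounding ZIP1/ZIP2 shuffles interleave the input rows so that each group of four bytes fed to SDOT corresponds to one output pixel's receptive-field slice.]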
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 3fc1b13d9c..aad34c4c25 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,716 +30,708 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"lsr x15, %x[n_channels], #0x4\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v9.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v8.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v7.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"mov x12, #0x0\n"
"mov x11, #0x0\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x15, 3f\n"
- "ldr q6, [x14, x12]\n"
- "ldr q5, [x13, x12]\n"
+ "ldr q11, [x14, x12]\n"
+ "ldr q20, [x13, x12]\n"
"subs x15, x15, #0x1\n"
- "ldr q4, [x10, x12]\n"
- "ldr q3, [x9, x12]\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q1, [x28, x12]\n"
- "ldr q0, [x27, x12]\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
- "ldr q31, [x26, x12]\n"
- "ldr q30, [x25, x12]\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "ldr q28, [%x[params], #0x20]\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "ldp x14, x13, [%x[inptrs], #0x40]\n"
- "ldr q25, [x14, x12]\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "ldr q24, [x13, x12]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "ldr q23, [x10, x12]\n"
- "ldr q22, [x9, x12]\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldr q20, [x28, x12]\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "ldr q19, [x27, x12]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "ldr q18, [x26, x12]\n"
- "ldr q17, [x25, x12]\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
+ "ldr q16, [x10, x12]\n"
+ "ldr q14, [x9, x12]\n"
+ "zip2 v19.16b, v11.16b, v16.16b\n"
+ "zip1 v11.16b, v11.16b, v16.16b\n"
+ "ldr q13, [x28, x12]\n"
+ "ldr q18, [x27, x12]\n"
+ "zip1 v17.16b, v20.16b, v14.16b\n"
+ "zip2 v14.16b, v20.16b, v14.16b\n"
+ "ldr q16, [x26, x12]\n"
+ "ldr q27, [x21, x12]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "zip1 v3.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "zip1 v16.16b, v18.16b, v27.16b\n"
+ "zip2 v27.16b, v18.16b, v27.16b\n"
+ "ldr q17, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v28.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "zip2 v20.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "zip1 v22.16b, v17.16b, v7.16b\n"
+ "zip2 v7.16b, v17.16b, v7.16b\n"
+ "ldr q19, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v21.16b, v30.16b, v27.16b\n"
+ "zip2 v27.16b, v30.16b, v27.16b\n"
+ "ldr q30, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "zip2 v17.16b, v16.16b, v30.16b\n"
+ "zip1 v16.16b, v16.16b, v30.16b\n"
+ "zip1 v18.16b, v19.16b, v1.16b\n"
+ "zip2 v1.16b, v19.16b, v1.16b\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v29.16b, v5.16b, v22.16b\n"
+ "zip1 v5.16b, v5.16b, v22.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
+ "zip2 v30.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v18.16b\n"
+ "zip1 v2.16b, v17.16b, v1.16b\n"
+ "zip2 v1.16b, v17.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
"add x12, x12, #0x10\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
"subs x15, x15, #0x1\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x60]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x40]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x30]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x70]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q5, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q13, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x20]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v26.16b, v24.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v23.16b, v24.16b\n"
+ ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
- "ldr q5, [x13, x12]\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- "ldr q0, [x27, x12]\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0xc0]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0xb0]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x90]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x80]\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
+ "ldr q10, [x13, x12]\n"
+ ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
+ ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
+ "ldr q20, [x27, x12]\n"
+ ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v5.4s\n"
+ ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v5.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v5.4s\n"
+ "ldr q19, [%x[params], #0xc0]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "and v16.16b, v23.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x80]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s25, [x23, x11]\n"
+ "str s23, [x22, x11]\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "mov v13.16b, v24.16b\n"
+ ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
"add x11, x11, #0x4\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
- "ldr q4, [x10, x12]\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- "ldr q31, [x26, x12]\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x120]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x100]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x110]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0xf0]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x130]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x10, x12]\n"
+ ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
+ "ldr q4, [x26, x12]\n"
+ ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v19.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "ldr q19, [%x[params], #0x120]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v31.16b, v22.16b\n"
+ "and v16.16b, v13.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0x130]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v13.4s, v13.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s24, [x25, x11]\n"
"ldr q2, [%x[params], #0xe0]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s23, [x24, x11]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str s31, [x23, x11]\n"
+ "mov v25.16b, v2.16b\n"
+ "str s13, [x22, x11]\n"
"mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mov v30.16b, v2.16b\n"
+ ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
- "ldr q3, [x9, x12]\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- "ldr q30, [x25, x12]\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [x14, x12]\n"
- "ldp x14, x13, [%x[inptrs], #0x40]\n"
- "ldr q25, [x14, x12]\n"
- "ldr q24, [x13, x12]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "ldr q23, [x10, x12]\n"
- "ldr q22, [x9, x12]\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x160]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x170]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x150]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [x28, x12]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldr q20, [x28, x12]\n"
- "ldr q19, [x27, x12]\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q18, [x26, x12]\n"
- "ldr q17, [x25, x12]\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x9, x12]\n"
+ ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
+ "ldr q27, [x21, x12]\n"
+ ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v2.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v19.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "ldr q29, [x20, x12]\n"
+ "sqadd v2.4s, v2.4s, v16.4s\n"
+ "and v19.16b, v25.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q26, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v22.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "ldr q9, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x170]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "ldr q13, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v8.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q28, [x20, x12]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v30.4s, v30.4s, v15.4s\n"
+ "smin v2.4s, v2.4s, v12.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
"ldp x14, x13, [%x[inptrs], #0x0]\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"ldp x10, x9, [%x[inptrs], #0x10]\n"
"ldp x28, x27, [%x[inptrs], #0x20]\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x25, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x24, x11]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x23, x11]\n"
+ "str s30, [x22, x11]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
"add x11, x11, #0x4\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x140]\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
"add %x[params], %x[params], #0x180\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
+ "zip2 v22.16b, v13.16b, v4.16b\n"
+ "zip1 v13.16b, v13.16b, v4.16b\n"
+ "zip1 v2.16b, v20.16b, v27.16b\n"
+ "zip2 v27.16b, v20.16b, v27.16b\n"
+ "zip2 v19.16b, v5.16b, v26.16b\n"
+ "zip1 v5.16b, v5.16b, v26.16b\n"
+ "zip1 v18.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v4.16b, v16.16b, v23.16b\n"
+ "zip1 v16.16b, v16.16b, v23.16b\n"
+ "zip1 v17.16b, v28.16b, v1.16b\n"
+ "zip2 v1.16b, v28.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v2.16b\n"
+ "zip1 v13.16b, v13.16b, v2.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v18.16b\n"
+ "zip1 v5.16b, v5.16b, v18.16b\n"
+ "zip1 v0.16b, v19.16b, v7.16b\n"
+ "zip2 v7.16b, v19.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v4.16b, v1.16b\n"
+ "zip2 v1.16b, v4.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
"add x12, x12, #0x10\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x60]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x40]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x30]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x70]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x20]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
"ext v5.16b, v5.16b, v5.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0xc0]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0xb0]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0x90]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0x80]\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q19, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v19.16b\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "and v16.16b, v4.16b, v19.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q5, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q25, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v22.16b, v25.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "mov v19.16b, v25.16b\n"
+ ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
+ ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v24.4s\n"
+ ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v25.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0xc0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v18.16b, v22.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xd0]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v20.16b, v10.16b\n"
+ ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
+ "mov v19.16b, v10.16b\n"
+ ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
"add x11, x11, #0x4\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "ldr q6, [%x[params], #0x120]\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "ldr q28, [%x[params], #0x100]\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [%x[params], #0x110]\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "ldr q29, [%x[params], #0xf0]\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "ldr q1, [%x[params], #0x130]\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
- "ldr q2, [%x[params], #0xe0]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "add %x[params], %x[params], #0x140\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s26, [x23, x11]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s21, [x22, x11]\n"
- "mov v26.16b, v2.16b\n"
- "str s16, [x21, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
"ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v10.4s, v10.4s, v24.4s\n"
+ ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v10.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0x120]\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v18.16b, v28.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v10.4s, v10.4s, v15.4s\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v10.4s, v10.4s, v8.4s\n"
+ "add v28.4s, v28.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s10, [x25, x11]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s28, [x24, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s19, [x22, x11]\n"
+ "mov v20.16b, v22.16b\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
"add x11, x11, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s2, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s22, [x25, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s21, [x24, x11]\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
"add x11, x11, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -751,740 +743,738 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x28, x28, x12\n"
"add x27, x27, x12\n"
"add x26, x26, x12\n"
- "add x25, x25, x12\n"
+ "add x21, x21, x12\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d6, [x14], #0x8\n"
- "ldr d5, [x13], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
+ "ldr d11, [x14], #0x8\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d3, [x10], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d13, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v6.s }[2], [x14], #0x4\n"
- "ld1 { v5.s }[2], [x13], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x14], #0x4\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v3.s }[2], [x10], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v6.h }[6], [x14], #0x2\n"
- "ld1 { v5.h }[6], [x13], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v1.h }[6], [x28], #0x2\n"
- "ld1 { v0.h }[6], [x27], #0x2\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v30.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x14], #0x2\n"
+ "ld1 { v10.h }[6], [x13], #0x2\n"
+ "ld1 { v3.h }[6], [x10], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[14], [x14], #0x1\n"
- "ld1 { v5.b }[14], [x13], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v1.b }[14], [x28], #0x1\n"
- "ld1 { v0.b }[14], [x27], #0x1\n"
- "ld1 { v31.b }[14], [x26], #0x1\n"
- "ld1 { v30.b }[14], [x25], #0x1\n"
+ "ld1 { v11.b }[14], [x14], #0x1\n"
+ "ld1 { v10.b }[14], [x13], #0x1\n"
+ "ld1 { v3.b }[14], [x10], #0x1\n"
+ "ld1 { v14.b }[14], [x9], #0x1\n"
+ "ld1 { v13.b }[14], [x28], #0x1\n"
+ "ld1 { v28.b }[14], [x27], #0x1\n"
+ "ld1 { v21.b }[14], [x26], #0x1\n"
+ "ld1 { v27.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[12], [x14], #0x1\n"
- "ld1 { v5.b }[12], [x13], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v1.b }[12], [x28], #0x1\n"
- "ld1 { v0.b }[12], [x27], #0x1\n"
- "ld1 { v31.b }[12], [x26], #0x1\n"
- "ld1 { v30.b }[12], [x25], #0x1\n"
+ "ld1 { v11.b }[12], [x14], #0x1\n"
+ "ld1 { v10.b }[12], [x13], #0x1\n"
+ "ld1 { v3.b }[12], [x10], #0x1\n"
+ "ld1 { v14.b }[12], [x9], #0x1\n"
+ "ld1 { v13.b }[12], [x28], #0x1\n"
+ "ld1 { v28.b }[12], [x27], #0x1\n"
+ "ld1 { v21.b }[12], [x26], #0x1\n"
+ "ld1 { v27.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v6.h }[4], [x14], #0x2\n"
- "ld1 { v5.h }[4], [x13], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v1.h }[4], [x28], #0x2\n"
- "ld1 { v0.h }[4], [x27], #0x2\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v30.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x14], #0x2\n"
+ "ld1 { v10.h }[4], [x13], #0x2\n"
+ "ld1 { v3.h }[4], [x10], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[10], [x14], #0x1\n"
- "ld1 { v5.b }[10], [x13], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v1.b }[10], [x28], #0x1\n"
- "ld1 { v0.b }[10], [x27], #0x1\n"
- "ld1 { v31.b }[10], [x26], #0x1\n"
- "ld1 { v30.b }[10], [x25], #0x1\n"
+ "ld1 { v11.b }[10], [x14], #0x1\n"
+ "ld1 { v10.b }[10], [x13], #0x1\n"
+ "ld1 { v3.b }[10], [x10], #0x1\n"
+ "ld1 { v14.b }[10], [x9], #0x1\n"
+ "ld1 { v13.b }[10], [x28], #0x1\n"
+ "ld1 { v28.b }[10], [x27], #0x1\n"
+ "ld1 { v21.b }[10], [x26], #0x1\n"
+ "ld1 { v27.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[8], [x14], #0x1\n"
- "ld1 { v5.b }[8], [x13], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v1.b }[8], [x28], #0x1\n"
- "ld1 { v0.b }[8], [x27], #0x1\n"
- "ld1 { v31.b }[8], [x26], #0x1\n"
- "ld1 { v30.b }[8], [x25], #0x1\n"
+ "ld1 { v11.b }[8], [x14], #0x1\n"
+ "ld1 { v10.b }[8], [x13], #0x1\n"
+ "ld1 { v3.b }[8], [x10], #0x1\n"
+ "ld1 { v14.b }[8], [x9], #0x1\n"
+ "ld1 { v13.b }[8], [x28], #0x1\n"
+ "ld1 { v28.b }[8], [x27], #0x1\n"
+ "ld1 { v21.b }[8], [x26], #0x1\n"
+ "ld1 { v27.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s6, [x14], #0x4\n"
- "ldr s5, [x13], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
+ "ldr s11, [x14], #0x4\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s3, [x10], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s13, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v6.h }[2], [x14], #0x2\n"
- "ld1 { v5.h }[2], [x13], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v1.h }[2], [x28], #0x2\n"
- "ld1 { v0.h }[2], [x27], #0x2\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x14], #0x2\n"
+ "ld1 { v10.h }[2], [x13], #0x2\n"
+ "ld1 { v3.h }[2], [x10], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[6], [x14], #0x1\n"
- "ld1 { v5.b }[6], [x13], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v1.b }[6], [x28], #0x1\n"
- "ld1 { v0.b }[6], [x27], #0x1\n"
- "ld1 { v31.b }[6], [x26], #0x1\n"
- "ld1 { v30.b }[6], [x25], #0x1\n"
+ "ld1 { v11.b }[6], [x14], #0x1\n"
+ "ld1 { v10.b }[6], [x13], #0x1\n"
+ "ld1 { v3.b }[6], [x10], #0x1\n"
+ "ld1 { v14.b }[6], [x9], #0x1\n"
+ "ld1 { v13.b }[6], [x28], #0x1\n"
+ "ld1 { v28.b }[6], [x27], #0x1\n"
+ "ld1 { v21.b }[6], [x26], #0x1\n"
+ "ld1 { v27.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[4], [x14], #0x1\n"
- "ld1 { v5.b }[4], [x13], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v1.b }[4], [x28], #0x1\n"
- "ld1 { v0.b }[4], [x27], #0x1\n"
- "ld1 { v31.b }[4], [x26], #0x1\n"
- "ld1 { v30.b }[4], [x25], #0x1\n"
+ "ld1 { v11.b }[4], [x14], #0x1\n"
+ "ld1 { v10.b }[4], [x13], #0x1\n"
+ "ld1 { v3.b }[4], [x10], #0x1\n"
+ "ld1 { v14.b }[4], [x9], #0x1\n"
+ "ld1 { v13.b }[4], [x28], #0x1\n"
+ "ld1 { v28.b }[4], [x27], #0x1\n"
+ "ld1 { v21.b }[4], [x26], #0x1\n"
+ "ld1 { v27.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h6, [x14], #0x2\n"
- "ldr h5, [x13], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h0, [x27], #0x2\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h30, [x25], #0x2\n"
+ "ldr h11, [x14], #0x2\n"
+ "ldr h10, [x13], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h13, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h21, [x26], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v6.b }[2], [x14], #0x1\n"
- "ld1 { v5.b }[2], [x13], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v1.b }[2], [x28], #0x1\n"
- "ld1 { v0.b }[2], [x27], #0x1\n"
- "ld1 { v31.b }[2], [x26], #0x1\n"
- "ld1 { v30.b }[2], [x25], #0x1\n"
+ "ld1 { v11.b }[2], [x14], #0x1\n"
+ "ld1 { v10.b }[2], [x13], #0x1\n"
+ "ld1 { v3.b }[2], [x10], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v13.b }[2], [x28], #0x1\n"
+ "ld1 { v28.b }[2], [x27], #0x1\n"
+ "ld1 { v21.b }[2], [x26], #0x1\n"
+ "ld1 { v27.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b6, [x14], #0x1\n"
- "ldr b5, [x13], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b1, [x28], #0x1\n"
- "ldr b0, [x27], #0x1\n"
- "ldr b31, [x26], #0x1\n"
- "ldr b30, [x25], #0x1\n"
+ "ldr b11, [x14], #0x1\n"
+ "ldr b10, [x13], #0x1\n"
+ "ldr b3, [x10], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b13, [x28], #0x1\n"
+ "ldr b28, [x27], #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "ldr b27, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x14, x13, [%x[inptrs], #0x40]\n"
"ldp x10, x9, [%x[inptrs], #0x50]\n"
"add x14, x14, x12\n"
"add x13, x13, x12\n"
"ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x10, x10, x12\n"
"add x9, x9, x12\n"
"add x28, x28, x12\n"
"add x27, x27, x12\n"
"add x26, x26, x12\n"
- "add x25, x25, x12\n"
+ "add x21, x21, x12\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d25, [x14], #0x8\n"
- "ldr d24, [x13], #0x8\n"
- "ldr d23, [x10], #0x8\n"
- "ldr d22, [x9], #0x8\n"
- "ldr d20, [x28], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d18, [x26], #0x8\n"
- "ldr d17, [x25], #0x8\n"
+ "ldr d5, [x14], #0x8\n"
+ "ldr d29, [x13], #0x8\n"
+ "ldr d0, [x10], #0x8\n"
+ "ldr d7, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v25.s }[2], [x14], #0x4\n"
- "ld1 { v24.s }[2], [x13], #0x4\n"
- "ld1 { v23.s }[2], [x10], #0x4\n"
- "ld1 { v22.s }[2], [x9], #0x4\n"
- "ld1 { v20.s }[2], [x28], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "ld1 { v18.s }[2], [x26], #0x4\n"
- "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v5.s }[2], [x14], #0x4\n"
+ "ld1 { v29.s }[2], [x13], #0x4\n"
+ "ld1 { v0.s }[2], [x10], #0x4\n"
+ "ld1 { v7.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v25.h }[6], [x14], #0x2\n"
- "ld1 { v24.h }[6], [x13], #0x2\n"
- "ld1 { v23.h }[6], [x10], #0x2\n"
- "ld1 { v22.h }[6], [x9], #0x2\n"
- "ld1 { v20.h }[6], [x28], #0x2\n"
- "ld1 { v19.h }[6], [x27], #0x2\n"
- "ld1 { v18.h }[6], [x26], #0x2\n"
- "ld1 { v17.h }[6], [x25], #0x2\n"
+ "ld1 { v5.h }[6], [x14], #0x2\n"
+ "ld1 { v29.h }[6], [x13], #0x2\n"
+ "ld1 { v0.h }[6], [x10], #0x2\n"
+ "ld1 { v7.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v30.h }[6], [x27], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[14], [x14], #0x1\n"
- "ld1 { v24.b }[14], [x13], #0x1\n"
- "ld1 { v23.b }[14], [x10], #0x1\n"
- "ld1 { v22.b }[14], [x9], #0x1\n"
- "ld1 { v20.b }[14], [x28], #0x1\n"
- "ld1 { v19.b }[14], [x27], #0x1\n"
- "ld1 { v18.b }[14], [x26], #0x1\n"
- "ld1 { v17.b }[14], [x25], #0x1\n"
+ "ld1 { v5.b }[14], [x14], #0x1\n"
+ "ld1 { v29.b }[14], [x13], #0x1\n"
+ "ld1 { v0.b }[14], [x10], #0x1\n"
+ "ld1 { v7.b }[14], [x9], #0x1\n"
+ "ld1 { v16.b }[14], [x28], #0x1\n"
+ "ld1 { v30.b }[14], [x27], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[12], [x14], #0x1\n"
- "ld1 { v24.b }[12], [x13], #0x1\n"
- "ld1 { v23.b }[12], [x10], #0x1\n"
- "ld1 { v22.b }[12], [x9], #0x1\n"
- "ld1 { v20.b }[12], [x28], #0x1\n"
- "ld1 { v19.b }[12], [x27], #0x1\n"
- "ld1 { v18.b }[12], [x26], #0x1\n"
- "ld1 { v17.b }[12], [x25], #0x1\n"
+ "ld1 { v5.b }[12], [x14], #0x1\n"
+ "ld1 { v29.b }[12], [x13], #0x1\n"
+ "ld1 { v0.b }[12], [x10], #0x1\n"
+ "ld1 { v7.b }[12], [x9], #0x1\n"
+ "ld1 { v16.b }[12], [x28], #0x1\n"
+ "ld1 { v30.b }[12], [x27], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v25.h }[4], [x14], #0x2\n"
- "ld1 { v24.h }[4], [x13], #0x2\n"
- "ld1 { v23.h }[4], [x10], #0x2\n"
- "ld1 { v22.h }[4], [x9], #0x2\n"
- "ld1 { v20.h }[4], [x28], #0x2\n"
- "ld1 { v19.h }[4], [x27], #0x2\n"
- "ld1 { v18.h }[4], [x26], #0x2\n"
- "ld1 { v17.h }[4], [x25], #0x2\n"
+ "ld1 { v5.h }[4], [x14], #0x2\n"
+ "ld1 { v29.h }[4], [x13], #0x2\n"
+ "ld1 { v0.h }[4], [x10], #0x2\n"
+ "ld1 { v7.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v30.h }[4], [x27], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[10], [x14], #0x1\n"
- "ld1 { v24.b }[10], [x13], #0x1\n"
- "ld1 { v23.b }[10], [x10], #0x1\n"
- "ld1 { v22.b }[10], [x9], #0x1\n"
- "ld1 { v20.b }[10], [x28], #0x1\n"
- "ld1 { v19.b }[10], [x27], #0x1\n"
- "ld1 { v18.b }[10], [x26], #0x1\n"
- "ld1 { v17.b }[10], [x25], #0x1\n"
+ "ld1 { v5.b }[10], [x14], #0x1\n"
+ "ld1 { v29.b }[10], [x13], #0x1\n"
+ "ld1 { v0.b }[10], [x10], #0x1\n"
+ "ld1 { v7.b }[10], [x9], #0x1\n"
+ "ld1 { v16.b }[10], [x28], #0x1\n"
+ "ld1 { v30.b }[10], [x27], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[8], [x14], #0x1\n"
- "ld1 { v24.b }[8], [x13], #0x1\n"
- "ld1 { v23.b }[8], [x10], #0x1\n"
- "ld1 { v22.b }[8], [x9], #0x1\n"
- "ld1 { v20.b }[8], [x28], #0x1\n"
- "ld1 { v19.b }[8], [x27], #0x1\n"
- "ld1 { v18.b }[8], [x26], #0x1\n"
- "ld1 { v17.b }[8], [x25], #0x1\n"
+ "ld1 { v5.b }[8], [x14], #0x1\n"
+ "ld1 { v29.b }[8], [x13], #0x1\n"
+ "ld1 { v0.b }[8], [x10], #0x1\n"
+ "ld1 { v7.b }[8], [x9], #0x1\n"
+ "ld1 { v16.b }[8], [x28], #0x1\n"
+ "ld1 { v30.b }[8], [x27], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s25, [x14], #0x4\n"
- "ldr s24, [x13], #0x4\n"
- "ldr s23, [x10], #0x4\n"
- "ldr s22, [x9], #0x4\n"
- "ldr s20, [x28], #0x4\n"
- "ldr s19, [x27], #0x4\n"
- "ldr s18, [x26], #0x4\n"
- "ldr s17, [x25], #0x4\n"
+ "ldr s5, [x14], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s7, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v25.h }[2], [x14], #0x2\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "ld1 { v22.h }[2], [x9], #0x2\n"
- "ld1 { v20.h }[2], [x28], #0x2\n"
- "ld1 { v19.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v17.h }[2], [x25], #0x2\n"
+ "ld1 { v5.h }[2], [x14], #0x2\n"
+ "ld1 { v29.h }[2], [x13], #0x2\n"
+ "ld1 { v0.h }[2], [x10], #0x2\n"
+ "ld1 { v7.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[6], [x14], #0x1\n"
- "ld1 { v24.b }[6], [x13], #0x1\n"
- "ld1 { v23.b }[6], [x10], #0x1\n"
- "ld1 { v22.b }[6], [x9], #0x1\n"
- "ld1 { v20.b }[6], [x28], #0x1\n"
- "ld1 { v19.b }[6], [x27], #0x1\n"
- "ld1 { v18.b }[6], [x26], #0x1\n"
- "ld1 { v17.b }[6], [x25], #0x1\n"
+ "ld1 { v5.b }[6], [x14], #0x1\n"
+ "ld1 { v29.b }[6], [x13], #0x1\n"
+ "ld1 { v0.b }[6], [x10], #0x1\n"
+ "ld1 { v7.b }[6], [x9], #0x1\n"
+ "ld1 { v16.b }[6], [x28], #0x1\n"
+ "ld1 { v30.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[4], [x14], #0x1\n"
- "ld1 { v24.b }[4], [x13], #0x1\n"
- "ld1 { v23.b }[4], [x10], #0x1\n"
- "ld1 { v22.b }[4], [x9], #0x1\n"
- "ld1 { v20.b }[4], [x28], #0x1\n"
- "ld1 { v19.b }[4], [x27], #0x1\n"
- "ld1 { v18.b }[4], [x26], #0x1\n"
- "ld1 { v17.b }[4], [x25], #0x1\n"
+ "ld1 { v5.b }[4], [x14], #0x1\n"
+ "ld1 { v29.b }[4], [x13], #0x1\n"
+ "ld1 { v0.b }[4], [x10], #0x1\n"
+ "ld1 { v7.b }[4], [x9], #0x1\n"
+ "ld1 { v16.b }[4], [x28], #0x1\n"
+ "ld1 { v30.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h25, [x14], #0x2\n"
- "ldr h24, [x13], #0x2\n"
- "ldr h23, [x10], #0x2\n"
- "ldr h22, [x9], #0x2\n"
- "ldr h20, [x28], #0x2\n"
- "ldr h19, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h17, [x25], #0x2\n"
+ "ldr h5, [x14], #0x2\n"
+ "ldr h29, [x13], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h7, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h30, [x27], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v25.b }[2], [x14], #0x1\n"
- "ld1 { v24.b }[2], [x13], #0x1\n"
- "ld1 { v23.b }[2], [x10], #0x1\n"
- "ld1 { v22.b }[2], [x9], #0x1\n"
- "ld1 { v20.b }[2], [x28], #0x1\n"
- "ld1 { v19.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v17.b }[2], [x25], #0x1\n"
+ "ld1 { v5.b }[2], [x14], #0x1\n"
+ "ld1 { v29.b }[2], [x13], #0x1\n"
+ "ld1 { v0.b }[2], [x10], #0x1\n"
+ "ld1 { v7.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v30.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b25, [x14], #0x1\n"
- "ldr b24, [x13], #0x1\n"
- "ldr b23, [x10], #0x1\n"
- "ldr b22, [x9], #0x1\n"
- "ldr b20, [x28], #0x1\n"
- "ldr b19, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b17, [x25], #0x1\n"
+ "ldr b5, [x14], #0x1\n"
+ "ldr b29, [x13], #0x1\n"
+ "ldr b0, [x10], #0x1\n"
+ "ldr b7, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b30, [x27], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q29, [%x[params], #0x10]\n"
- "ldr q28, [%x[params], #0x20]\n"
- "zip2 v2.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "ldr q27, [%x[params], #0x30]\n"
- "zip1 v4.16b, v5.16b, v3.16b\n"
- "zip2 v3.16b, v5.16b, v3.16b\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
"cmp x20, #0x4\n"
- "zip2 v5.16b, v6.16b, v4.16b\n"
- "zip1 v6.16b, v6.16b, v4.16b\n"
- "zip1 v4.16b, v2.16b, v3.16b\n"
- "zip2 v3.16b, v2.16b, v3.16b\n"
- "ldr q2, [%x[params], #0x0]\n"
- "zip2 v26.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v0.16b, v30.16b\n"
- "zip2 v30.16b, v0.16b, v30.16b\n"
- "zip2 v21.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v24.16b, v22.16b\n"
- "zip2 v22.16b, v24.16b, v22.16b\n"
- "zip2 v16.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v19.16b, v17.16b\n"
- "zip2 v17.16b, v19.16b, v17.16b\n"
- "zip2 v0.16b, v1.16b, v31.16b\n"
- "zip1 v1.16b, v1.16b, v31.16b\n"
- "zip1 v31.16b, v26.16b, v30.16b\n"
- "zip2 v30.16b, v26.16b, v30.16b\n"
- "zip2 v24.16b, v25.16b, v23.16b\n"
- "zip1 v25.16b, v25.16b, v23.16b\n"
- "zip1 v23.16b, v21.16b, v22.16b\n"
- "zip2 v22.16b, v21.16b, v22.16b\n"
- "zip2 v19.16b, v20.16b, v18.16b\n"
- "zip1 v20.16b, v20.16b, v18.16b\n"
- "zip1 v18.16b, v16.16b, v17.16b\n"
- "zip2 v17.16b, v16.16b, v17.16b\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
- ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
- ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
- ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
- ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v22.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v28.16b, v27.16b\n"
+ "zip2 v20.16b, v5.16b, v0.16b\n"
+ "zip1 v5.16b, v5.16b, v0.16b\n"
+ "zip1 v19.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v18.16b, v16.16b, v2.16b\n"
+ "zip1 v16.16b, v16.16b, v2.16b\n"
+ "zip1 v17.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v19.16b\n"
+ "zip1 v5.16b, v5.16b, v19.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v1.16b\n"
+ "zip2 v1.16b, v18.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
+ ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
+ ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
- ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
+ ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 20f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 21f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
- ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
"cmp x20, #0x4\n"
- ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
- ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
+ ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
- ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
- ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
+ ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 24f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 25f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
- ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
"cmp x20, #0x4\n"
- ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
- ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
- ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
- ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
- ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 28f\n"
- "str s2, [x24, x11]\n"
- "str s26, [x23, x11]\n"
- "str s21, [x22, x11]\n"
- "str s16, [x21, x11]\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 29f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
-
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
"ble 35f\n"
- "ldr q2, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v26.16b, v2.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr q28, [%x[params], #0x20]\n"
- "ldr q27, [%x[params], #0x30]\n"
- "mov v16.16b, v2.16b\n"
- ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
- "ldr q6, [%x[params], #0x40]\n"
- "ldr q1, [%x[params], #0x50]\n"
- ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
- ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q22, [%x[params], #0x50]\n"
+ ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
- ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
- ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
- ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
- ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "sqrdmulh v2.4s, v2.4s, v6.4s\n"
- ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
- ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
- "and v29.16b, v2.16b, v1.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sqadd v2.4s, v2.4s, v29.4s\n"
- "and v28.16b, v26.16b, v1.16b\n"
- "and v27.16b, v21.16b, v1.16b\n"
- "and v29.16b, v16.16b, v1.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "srshl v2.4s, v2.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v1.4s\n"
- "add v2.4s, v2.4s, v7.4s\n"
- "add v26.4s, v26.4s, v7.4s\n"
- "add v21.4s, v21.4s, v7.4s\n"
- "add v16.4s, v16.4s, v7.4s\n"
- "smax v2.4s, v2.4s, v9.4s\n"
- "smax v26.4s, v26.4s, v9.4s\n"
- "smax v21.4s, v21.4s, v9.4s\n"
- "smax v16.4s, v16.4s, v9.4s\n"
- "smin v2.4s, v2.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v8.4s\n"
- "smin v21.4s, v21.4s, v8.4s\n"
- "smin v16.4s, v16.4s, v8.4s\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
+ ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v23.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v18.16b, v22.16b\n"
+ "and v16.16b, v4.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "srshl v4.4s, v4.4s, v22.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz x20, #1, 33f\n"
- "st1 { v2.h }[0], [x24], #0x2\n"
- "st1 { v26.h }[0], [x23], #0x2\n"
- "st1 { v21.h }[0], [x22], #0x2\n"
- "st1 { v16.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v2.b }[2], [x24], #0x1\n"
- "st1 { v26.b }[2], [x23], #0x1\n"
- "st1 { v21.b }[2], [x22], #0x1\n"
- "st1 { v16.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v2.b }[0], [x24], #0x1\n"
- "st1 { v26.b }[0], [x23], #0x1\n"
- "st1 { v21.b }[0], [x22], #0x1\n"
- "st1 { v16.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
-
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
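Editor's note on the epilogue repeated throughout the kernel above: it is the Requantize32 fixed-point rescale — sqrdmulh against the per-layer multiplier, an and/sshr/sqadd fix-up applied before the shift, srshl by the (negative) per-layer shift, addition of the output offset, smax/smin clamping against the quantised range loaded from qp, and uzp1 narrowing ahead of the 4-byte str s store. The fix-up is the usual gemmlowp-style nudge: it subtracts 1 from negative values so the rounding shift behaves as round-to-nearest with ties away from zero instead of srshl's ties-towards-plus-infinity. Below is a minimal scalar sketch of one lane under those assumptions; it illustrates the rounding scheme only and is not the library's code — the helper name requantize_lane is hypothetical.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of one output lane of the epilogue above:
    // sqrdmulh, sign fix-up, srshl, offset add, clamp, narrow.
    int8_t requantize_lane(int32_t acc, int32_t mult, int32_t shift,
                           int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: high half of the rounding doubling multiply,
        // (2*acc*mult + 2^31) >> 32, written overflow-free as
        // (acc*mult + 2^30) >> 31 (saturation elided for clarity).
        int64_t prod = (int64_t)acc * (int64_t)mult;
        int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);

        // and/sshr/sqadd: 'shift' is negative, so its sign bit is set;
        // the mask-and-shift in the asm yields -1 exactly when 'high'
        // is negative, 0 otherwise.
        if (high < 0 && shift < 0)
            high -= 1;

        // srshl with a negative shift operand is a rounding
        // arithmetic shift right by -shift.
        int n = -shift;
        int32_t shifted = (n > 0)
            ? (int32_t)(((int64_t)high + (1LL << (n - 1))) >> n)
            : high;

        // add the c_offset vector, clamp to [minval, maxval], then
        // narrow to bytes as the uzp1 + str s pair does.
        int32_t out = shifted + c_offset;
        out = std::max(minval, std::min(maxval, out));
        return (int8_t)out;
    }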
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 8366b0a270..bea97a54b6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,7 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const uint8_t *const * const, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
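Editor's note on the prototype change in the hunk above: the only difference is that top-level const qualifiers on by-value parameters were dropped. Top-level const is not part of the function type, so both spellings declare the same function; the inner const uint8_t * qualifiers are kept because pointee const is part of the type. A two-line illustration with a hypothetical name f:

    // Top-level const on a parameter is ignored in the function type:
    void f(const unsigned int n, char *const p);
    void f(unsigned int n, char *p);  // redeclares f, not an overload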
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 986937f3b4..5a28daffbf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,15 +30,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const uint8_t *const *const inptrs,
- const uint8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- uint8_t *const *const outptrs
-)
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x20, #0x1\n"
@@ -47,817 +39,817 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"orr x20, x20, #0x10000\n"
"lsr x11, %x[n_channels], #0x4\n"
- "dup v14.4s, w20\n"
+ "dup v12.4s, w20\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v13.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"mov x28, #0x0\n"
"mov x27, #0x0\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
"cbz x11, 3f\n"
- "ldr q9, [x15, x28]\n"
- "ldr q8, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q7, [x13, x28]\n"
- "ldr q6, [x12, x28]\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q4, [x10, x28]\n"
- "ldr q3, [x9, x28]\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "ldr q2, [x26, x28]\n"
- "ldr q1, [x25, x28]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q15, [x15, x28]\n"
"ldr q28, [x14, x28]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "ldr q23, [x9, x28]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
"add x28, x28, #0x10\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"subs x11, x11, #0x1\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- "ldr q3, [x9, x28]\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "ldr q8, [x14, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- "ldr q2, [x26, x28]\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "ldr q7, [x13, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
"smax v25.4s, v25.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- "ldr q1, [x25, x28]\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "ldr q6, [x12, x28]\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [x15, x28]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q29, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldr q27, [x13, x28]\n"
- "ldr q26, [x12, x28]\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x170]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x150]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [x10, x28]\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldr q24, [x10, x28]\n"
- "ldr q23, [x9, x28]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "ldr q22, [x26, x28]\n"
- "ldr q21, [x25, x28]\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
+ ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
"smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
"ldp x13, x12, [%x[inptrs], #0x10]\n"
"ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "str s20, [x21, x27]\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
"add x27, x27, #0x4\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x140]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
"add x28, x28, #0x10\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
"ldr q4, [%x[params], #0x10]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x0]\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x60]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x40]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x50]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x30]\n"
- "add v5.4s, v5.4s, v10.4s\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
"ldr q4, [%x[params], #0x70]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x20]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0xc0]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0x90]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0xd0]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0x80]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
"add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "ldr q9, [%x[params], #0x120]\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q16, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "ldr q31, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "ldr q0, [%x[params], #0xf0]\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x130]\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s5, [x24, x27]\n"
- "ldr q5, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "mov v30.16b, v5.16b\n"
- "str s20, [x21, x27]\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
"add x27, x27, #0x4\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
"smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str s5, [x24, x27]\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s30, [x23, x27]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
"add x27, x27, #0x4\n"
"beq 35f\n"
"3:" // Oddments
@@ -869,794 +861,794 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d8, [x14], #0x8\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
"ldr d7, [x13], #0x8\n"
- "ldr d6, [x12], #0x8\n"
- "ldr d4, [x10], #0x8\n"
- "ldr d3, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d1, [x25], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
"ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v6.s }[2], [x12], #0x4\n"
- "ld1 { v4.s }[2], [x10], #0x4\n"
- "ld1 { v3.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.h }[6], [x15], #0x2\n"
- "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
"ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v6.h }[6], [x12], #0x2\n"
- "ld1 { v4.h }[6], [x10], #0x2\n"
- "ld1 { v3.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[14], [x15], #0x1\n"
- "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
"ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v6.b }[14], [x12], #0x1\n"
- "ld1 { v4.b }[14], [x10], #0x1\n"
- "ld1 { v3.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[12], [x15], #0x1\n"
- "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
"ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v6.b }[12], [x12], #0x1\n"
- "ld1 { v4.b }[12], [x10], #0x1\n"
- "ld1 { v3.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.h }[4], [x15], #0x2\n"
- "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
"ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v6.h }[4], [x12], #0x2\n"
- "ld1 { v4.h }[4], [x10], #0x2\n"
- "ld1 { v3.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[10], [x15], #0x1\n"
- "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
"ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v6.b }[10], [x12], #0x1\n"
- "ld1 { v4.b }[10], [x10], #0x1\n"
- "ld1 { v3.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[8], [x15], #0x1\n"
- "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
"ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v6.b }[8], [x12], #0x1\n"
- "ld1 { v4.b }[8], [x10], #0x1\n"
- "ld1 { v3.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s9, [x15], #0x4\n"
- "ldr s8, [x14], #0x4\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
"ldr s7, [x13], #0x4\n"
- "ldr s6, [x12], #0x4\n"
- "ldr s4, [x10], #0x4\n"
- "ldr s3, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v9.h }[2], [x15], #0x2\n"
- "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
"ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v6.h }[2], [x12], #0x2\n"
- "ld1 { v4.h }[2], [x10], #0x2\n"
- "ld1 { v3.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[6], [x15], #0x1\n"
- "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
"ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v6.b }[6], [x12], #0x1\n"
- "ld1 { v4.b }[6], [x10], #0x1\n"
- "ld1 { v3.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[4], [x15], #0x1\n"
- "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
"ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v6.b }[4], [x12], #0x1\n"
- "ld1 { v4.b }[4], [x10], #0x1\n"
- "ld1 { v3.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h9, [x15], #0x2\n"
- "ldr h8, [x14], #0x2\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
"ldr h7, [x13], #0x2\n"
- "ldr h6, [x12], #0x2\n"
- "ldr h4, [x10], #0x2\n"
- "ldr h3, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.b }[2], [x15], #0x1\n"
- "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
"ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v6.b }[2], [x12], #0x1\n"
- "ld1 { v4.b }[2], [x10], #0x1\n"
- "ld1 { v3.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b9, [x15], #0x1\n"
- "ldr b8, [x14], #0x1\n"
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
"ldr b7, [x13], #0x1\n"
- "ldr b6, [x12], #0x1\n"
- "ldr b4, [x10], #0x1\n"
- "ldr b3, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b1, [x25], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
"ldp x13, x12, [%x[inptrs], #0x50]\n"
"add x15, x15, x28\n"
"add x14, x14, x28\n"
"ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
"add x13, x13, x28\n"
"add x12, x12, x28\n"
"add x10, x10, x28\n"
"add x9, x9, x28\n"
"add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "add x21, x21, x28\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d29, [x15], #0x8\n"
- "ldr d28, [x14], #0x8\n"
- "ldr d27, [x13], #0x8\n"
- "ldr d26, [x12], #0x8\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v29.s }[2], [x15], #0x4\n"
- "ld1 { v28.s }[2], [x14], #0x4\n"
- "ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v26.s }[2], [x12], #0x4\n"
- "ld1 { v24.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v29.h }[6], [x15], #0x2\n"
- "ld1 { v28.h }[6], [x14], #0x2\n"
- "ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v26.h }[6], [x12], #0x2\n"
- "ld1 { v24.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v22.h }[6], [x26], #0x2\n"
- "ld1 { v21.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[14], [x15], #0x1\n"
- "ld1 { v28.b }[14], [x14], #0x1\n"
- "ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v26.b }[14], [x12], #0x1\n"
- "ld1 { v24.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v22.b }[14], [x26], #0x1\n"
- "ld1 { v21.b }[14], [x25], #0x1\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[12], [x15], #0x1\n"
- "ld1 { v28.b }[12], [x14], #0x1\n"
- "ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v26.b }[12], [x12], #0x1\n"
- "ld1 { v24.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v22.b }[12], [x26], #0x1\n"
- "ld1 { v21.b }[12], [x25], #0x1\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v29.h }[4], [x15], #0x2\n"
- "ld1 { v28.h }[4], [x14], #0x2\n"
- "ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v26.h }[4], [x12], #0x2\n"
- "ld1 { v24.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v22.h }[4], [x26], #0x2\n"
- "ld1 { v21.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[10], [x15], #0x1\n"
- "ld1 { v28.b }[10], [x14], #0x1\n"
- "ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v26.b }[10], [x12], #0x1\n"
- "ld1 { v24.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v22.b }[10], [x26], #0x1\n"
- "ld1 { v21.b }[10], [x25], #0x1\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[8], [x15], #0x1\n"
- "ld1 { v28.b }[8], [x14], #0x1\n"
- "ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v26.b }[8], [x12], #0x1\n"
- "ld1 { v24.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v22.b }[8], [x26], #0x1\n"
- "ld1 { v21.b }[8], [x25], #0x1\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s29, [x15], #0x4\n"
- "ldr s28, [x14], #0x4\n"
- "ldr s27, [x13], #0x4\n"
- "ldr s26, [x12], #0x4\n"
- "ldr s24, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v29.h }[2], [x15], #0x2\n"
- "ld1 { v28.h }[2], [x14], #0x2\n"
- "ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v24.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v22.h }[2], [x26], #0x2\n"
- "ld1 { v21.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[6], [x15], #0x1\n"
- "ld1 { v28.b }[6], [x14], #0x1\n"
- "ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v26.b }[6], [x12], #0x1\n"
- "ld1 { v24.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v22.b }[6], [x26], #0x1\n"
- "ld1 { v21.b }[6], [x25], #0x1\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[4], [x15], #0x1\n"
- "ld1 { v28.b }[4], [x14], #0x1\n"
- "ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v26.b }[4], [x12], #0x1\n"
- "ld1 { v24.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v22.b }[4], [x26], #0x1\n"
- "ld1 { v21.b }[4], [x25], #0x1\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h29, [x15], #0x2\n"
- "ldr h28, [x14], #0x2\n"
- "ldr h27, [x13], #0x2\n"
- "ldr h26, [x12], #0x2\n"
- "ldr h24, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h22, [x26], #0x2\n"
- "ldr h21, [x25], #0x2\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v29.b }[2], [x15], #0x1\n"
- "ld1 { v28.b }[2], [x14], #0x1\n"
- "ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v26.b }[2], [x12], #0x1\n"
- "ld1 { v24.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v22.b }[2], [x26], #0x1\n"
- "ld1 { v21.b }[2], [x25], #0x1\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b29, [x15], #0x1\n"
- "ldr b28, [x14], #0x1\n"
- "ldr b27, [x13], #0x1\n"
- "ldr b26, [x12], #0x1\n"
- "ldr b24, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b22, [x26], #0x1\n"
- "ldr b21, [x25], #0x1\n"
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q0, [%x[params], #0x10]\n"
- "ldr q16, [%x[params], #0x20]\n"
- "zip2 v30.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "ldr q31, [%x[params], #0x30]\n"
- "zip1 v2.16b, v3.16b, v1.16b\n"
- "zip2 v5.16b, v9.16b, v7.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
"cmp x20, #0x4\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v8.16b, v6.16b\n"
- "zip2 v6.16b, v8.16b, v6.16b\n"
- "zip2 v1.16b, v3.16b, v1.16b\n"
- "zip2 v3.16b, v4.16b, v2.16b\n"
- "zip1 v4.16b, v4.16b, v2.16b\n"
- "zip2 v25.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v27.16b, v28.16b, v26.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
- "zip2 v8.16b, v9.16b, v7.16b\n"
- "zip1 v9.16b, v9.16b, v7.16b\n"
- "zip1 v7.16b, v5.16b, v6.16b\n"
- "zip2 v6.16b, v5.16b, v6.16b\n"
- "ldr q5, [%x[params], #0x0]\n"
- "zip2 v26.16b, v28.16b, v26.16b\n"
- "zip2 v20.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v23.16b, v21.16b\n"
- "zip2 v21.16b, v23.16b, v21.16b\n"
- "zip2 v28.16b, v29.16b, v27.16b\n"
- "zip1 v29.16b, v29.16b, v27.16b\n"
- "zip1 v2.16b, v30.16b, v1.16b\n"
- ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip1 v27.16b, v25.16b, v26.16b\n"
- "zip2 v26.16b, v25.16b, v26.16b\n"
- "zip2 v23.16b, v24.16b, v22.16b\n"
- "zip1 v24.16b, v24.16b, v22.16b\n"
- "zip1 v22.16b, v20.16b, v21.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
- "zip2 v21.16b, v20.16b, v21.16b\n"
- "mov v30.16b, v5.16b\n"
- ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
- "mov v25.16b, v5.16b\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
- ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
- "ext v4.16b, v4.16b, v4.16b, #0x1\n"
- "ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
- ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
- ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
- ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
- ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
- "ldr q4, [%x[params], #0x50]\n"
- ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
+ ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
"add %x[params], %x[params], #0x60\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 20f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 21f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
- ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
- ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
- ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
- ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
- ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
- ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
- ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
+ ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
"ext v23.16b, v23.16b, v23.16b, #0x1\n"
- ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
- ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
- ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 24f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 25f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
- ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
- ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
- "movi v17.4s, #0x0\n"
"cmp x20, #0x4\n"
- ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
"ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
- ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
- ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
- ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
- ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
- ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
- ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
- ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
- ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"blt 28f\n"
- "str s5, [x24, x27]\n"
- "str s30, [x23, x27]\n"
- "str s25, [x22, x27]\n"
- "str s20, [x21, x27]\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 29f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
"add x27, x27, #0x4\n"
"ble 35f\n"
- "ldr q5, [%x[params], #0x0]\n"
- "ldr q0, [%x[params], #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
- "ldr q16, [%x[params], #0x20]\n"
- "ldr q31, [%x[params], #0x30]\n"
- "mov v30.16b, v5.16b\n"
- "mov v25.16b, v5.16b\n"
- "ldr q9, [%x[params], #0x40]\n"
- "ldr q4, [%x[params], #0x50]\n"
- "mov v20.16b, v5.16b\n"
- ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
- ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
- ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
- "movi v17.4s, #0x0\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
- "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
- ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
- ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
- ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
- ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
- ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
- "mls v5.4s, v19.4s, v11.4s\n"
- ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
- ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
- ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v9.4s\n"
- "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
- ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
- "mls v30.4s, v17.4s, v11.4s\n"
- "mls v25.4s, v18.4s, v11.4s\n"
- "mls v20.4s, v16.4s, v11.4s\n"
- "and v0.16b, v5.16b, v4.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqrdmulh v20.4s, v20.4s, v9.4s\n"
- "sqadd v5.4s, v5.4s, v0.4s\n"
- "and v16.16b, v30.16b, v4.16b\n"
- "and v31.16b, v25.16b, v4.16b\n"
- "and v0.16b, v20.16b, v4.16b\n"
+ ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sqadd v25.4s, v25.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "srshl v5.4s, v5.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "smax v5.4s, v5.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"smax v30.4s, v30.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
"add x24, x24, x27\n"
"add x23, x23, x27\n"
"add x22, x22, x27\n"
- "add x21, x21, x27\n"
"tbz x20, #1, 33f\n"
- "st1 { v5.h }[0], [x24], #0x2\n"
- "st1 { v30.h }[0], [x23], #0x2\n"
- "st1 { v25.h }[0], [x22], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v5.b }[2], [x24], #0x1\n"
- "st1 { v30.b }[2], [x23], #0x1\n"
- "st1 { v25.b }[2], [x22], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v5.b }[0], [x24], #0x1\n"
- "st1 { v30.b }[0], [x23], #0x1\n"
- "st1 { v25.b }[0], [x22], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
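For reference, the store tail seen repeatedly above (mls, sqrdmulh, and/sshr/sqadd, srshl, add, smax/smin, uzp1) is a standard fixed-point requantization applied to each 32-bit accumulator lane before narrowing to bytes. A minimal scalar sketch follows; the names are illustrative, and the mapping of mul/shift onto the arm_gemm::Requantize32 per-layer parameters is an assumption on the editor's part, not something this diff states.

    #include <algorithm>
    #include <cstdint>

    // One lane of the vector epilogue. The udot-against-ones trick earlier in
    // the kernel (udot vN.4s, v12.16b, input) produces row_sum, the sum of the
    // input bytes that contributed to acc.
    static inline int32_t sqrdmulh_scalar(int32_t a, int32_t b)
    {
        // sqrdmulh: saturating rounding doubling multiply, high half.
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        const int64_t p = static_cast<int64_t>(a) * b;
        return static_cast<int32_t>((2 * p + (INT64_C(1) << 31)) >> 32);
    }

    static inline uint8_t requantize_lane(int32_t acc, int32_t row_sum,
                                          int32_t b_offset, int32_t mul,
                                          int32_t shift,  // <= 0, as fed to srshl
                                          int32_t c_offset,
                                          int32_t minval, int32_t maxval)
    {
        acc -= row_sum * b_offset;         // mls: remove the weight zero-point term
        acc  = sqrdmulh_scalar(acc, mul);  // scale by the fixed-point multiplier
        // The and/sshr/sqadd trio nudges negative values down by one so the
        // rounding right shift below rounds ties away from zero (the asm uses
        // sqadd, which saturates where this sketch would wrap).
        if (shift < 0)
        {
            if (acc < 0) acc -= 1;
            const int n = -shift;          // srshl by a negative amount
            acc = (acc + (1 << (n - 1))) >> n;
        }
        acc += c_offset;                   // add the output zero point
        acc  = std::max(acc, minval);      // smax against qp.minval
        acc  = std::min(acc, maxval);      // smin against qp.maxval
        return static_cast<uint8_t>(acc);  // uzp1/st1 narrow each lane to a byte
    }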
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 49ef5dc0d9..9fc6a5bc34 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
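Two notes on the header change above. First, it now pulls in arm_gemm's utils.hpp by file name rather than by repository-relative path, so resolution relies on that directory being on the compiler's include search path, for example (an assumed invocation; the actual flags live in the project's build scripts):

    g++ -I. -Isrc/core/NEON/kernels/arm_gemm ... generic.cpp

Second, the compacted one-line prototype of a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl is type-identical to the multi-line form it replaces: only top-level consts on the parameters were dropped, which does not affect the function's signature. The generic.cpp diff that follows reallocates that kernel's registers; unlike the dot-product kernel above, it widens inputs and weights with usubl (subtracting the a_offset and b_offset vectors) and accumulates with smlal/smlal2 before running the same requantization tail.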
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 15bbb31413..26fe4c8a10 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
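The requantization tail shared by these u8q kernels (the SQRDMULH / AND / SSHR / SQADD / SRSHL / SQXTN runs in the hunks above) is the usual fixed-point rescale. Below is a minimal scalar C++ sketch of that arithmetic for one lane; it is illustrative only, not code from the library. Every name in it is an assumption, the saturating edge case of SQRDMULH (both operands INT32_MIN) is ignored, and the per-lane multiplier and shift stand in for the per-channel requant_muls / requant_shifts vectors named in the operand list.

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of the vector requantization sequence, one lane at a time.
    static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQRDMULH: rounding doubling multiply, keeping the high half.
        int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(mul);
        int32_t high = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);

        // The AND / SSHR #31 / SQADD trio in the assembly applies a
        // sign-dependent fixup so the subsequent rounding shift rounds to
        // nearest for negative values as well. SRSHL with a negative shift
        // operand is a rounding arithmetic right shift, so 'shift' is held
        // negated, matching the requant_shifts convention.
        int32_t rshift  = -shift;
        int32_t rounded = (rshift > 0)
            ? (high + (1 << (rshift - 1))) >> rshift
            : high;

        // Add the output offset (SQADD with the c_offset vector), clamp to
        // [minval, maxval] (SMAX / SMIN), then narrow (SQXTN / UZP1 / ST1).
        return static_cast<uint8_t>(
            std::clamp(rounded + c_offset, minval, maxval));
    }

In the vector code the same steps run four or eight lanes at a time, which is why each sqrdmulh on an accumulator is paired with an and/sshr/sqadd fixup before its srshl, and why the 32-bit results are narrowed in two stages (sqxtn then sqxtn2) before the byte stores.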
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 0baebafa3f..f4f2bc82e1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index de072a7d55..fb533893a6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v7.8h, v7.8b, v13.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
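The store tail repeated in the hunks above is the arm_gemm::Requantize32 fixed-point path: SQRDMULH by the per-channel multiplier, an AND/SSHR/SQADD sign fixup, SRSHL by the negative per-channel shift, a saturating narrow, addition of the c_offset, and the minval/maxval clamp before UZP1 packs the bytes. A minimal scalar sketch of one output lane follows, assuming a right shift of n bits; the helper names are invented here for illustration and are not library API. The register renames in the diff appear to reshuffle which v-registers hold these intermediates while leaving the arithmetic itself unchanged.

#include <algorithm>
#include <cstdint>

// SQRDMULH: saturating rounding doubling multiply, returning the high half.
static inline int32_t sqrdmulh_s32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturating case
    const int64_t p = static_cast<int64_t>(a) * b;
    return static_cast<int32_t>((p + (int64_t{1} << 30)) >> 31);
}

// AND + SSHR #31 + SQADD + SRSHL: rounding right shift by n with halves
// rounded away from zero (the fixup subtracts 1 from negative values
// before SRSHL's round-half-up).
static inline int32_t rounding_rshift(int32_t v, int n)
{
    if (n <= 0) return v;
    if (v < 0) v -= 1;
    return (v + (int32_t{1} << (n - 1))) >> n;
}

// One lane of the tail: multiplier/shift come from requant_muls/requant_shifts,
// c_offset/minval/maxval from the Requantize32 parameter block.
static inline uint8_t requantize_lane(int32_t acc, int32_t multiplier, int n,
                                      int32_t c_offset, int32_t minval,
                                      int32_t maxval)
{
    int32_t v = sqrdmulh_s32(acc, multiplier);
    v = rounding_rshift(v, n);       // SRSHL by the negative shift, with fixup
    v += c_offset;                   // SQADD of the c_offset splat
    v = std::max(v, minval);         // SMAX against minval
    v = std::min(v, maxval);         // SMIN against maxval
    return static_cast<uint8_t>(v);  // SQXTN/UZP1 narrow and pack
}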
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 407807fcc1..375e6f8f15 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 2fe688a65e..ae663585a2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
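
For reference, the requantisation epilogue in the hunk above (the sqrdmulh / and / sshr #0x1f / sqadd / srshl run, followed by sqxtn, the c_offset add, and the smax/smin clamp) implements the usual fixed-point rescale of the int32 accumulators back to uint8. The following scalar C++ model of a single lane is a sketch only — it is not part of the patch, the helper name requantize_lane and its parameters are hypothetical, and the kernel itself processes four lanes per instruction with the offset add and clamp performed on 16-bit lanes after narrowing (equivalent in int32 for in-range values). Saturation of the INT32_MIN * INT32_MIN corner of sqrdmulh is omitted for brevity.

#include <algorithm>
#include <cstdint>

inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                               int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating, rounding, doubling multiply returning the high half:
    // (2 * acc * mul + 2^31) >> 32.
    int64_t prod = (int64_t) acc * (int64_t) mul;
    int32_t high = (int32_t) ((2 * prod + (1LL << 31)) >> 32);

    // and / sshr #31 / sqadd: nudge negative values down by one so the
    // rounding shift below rounds half away from zero.
    if (high < 0 && right_shift > 0)
    {
        high -= 1;
    }

    // srshl with a negative shift amount: rounding arithmetic shift right.
    if (right_shift > 0)
    {
        high = (high + (1 << (right_shift - 1))) >> right_shift;
    }

    // sqadd with the c_offset splat, smax/smin clamp, then sqxtn/uzp1
    // narrow the result back to bytes.
    high += c_offset;
    high = std::max(high, minval);
    high = std::min(high, maxval);
    return (uint8_t) high;
}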
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
index b859978b1e..814efe006e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKern
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
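
The .cpp hunks that follow move the #if defined(__aarch64__) guard below the portable includes, so that arm_gemm.hpp, <cstddef>, and <cstdint> are always parsed and only the kernel body is architecture-gated. As a rough sketch of the resulting file shape (names abbreviated and hypothetical, not the actual patch text):

#include "arm_gemm.hpp"

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {

// Kernel implementation, compiled only for AArch64 targets.
void example_kernel_impl(/* ... */);

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)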
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 39001aa1fd..f7aa889b56 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"usubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"usubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"usubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"usubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
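A note on the requantisation idiom in the hunks above: each accumulator passes through sshl (per-layer left shift) and sqrdmulh (fixed-point multiply), then an and/sshr/sqadd fixup before the final srshl rounding right shift. The fixup subtracts one from negative values so the rounding shift rounds half away from zero rather than toward positive infinity. A minimal scalar sketch of one lane, assuming the shift operand holds the right shift as a non-positive value; the preceding left shift and the sqadd saturation are omitted, and the function name is illustrative:

    #include <cstdint>

    static int32_t requantize_lane(int32_t acc, int32_t mul, int32_t shift) // shift <= 0
    {
        // sqrdmulh: saturating rounding doubling multiply, keep the high half.
        int64_t prod = 2 * (int64_t) acc * (int64_t) mul;
        int32_t high = (int32_t) ((prod + (INT64_C(1) << 31)) >> 32);
        // and + sshr #31 + sqadd: bit 31 of a negative shift is set, so this
        // adds -1 exactly when 'high' is negative.
        high += (high & shift) >> 31;
        // srshl by a negative amount: rounding arithmetic shift right.
        int n = -shift;
        return n == 0 ? high : (int32_t) (((int64_t) high + (INT64_C(1) << (n - 1))) >> n);
    }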
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 134f657fb8..76965606f7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -57,4 +57,5 @@ struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
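The same guard rearrangement recurs throughout this patch: the trailing #endif gains a separating blank line, and in the matching source files the __aarch64__ check moves below the includes, so the headers are always parsed and only the kernel body is architecture-gated. Sketch of the resulting layout (a pattern illustration, not a verbatim excerpt):

    #include "arm_gemm.hpp"

    #include <cstddef>
    #include <cstdint>

    #if defined(__aarch64__)

    namespace arm_conv {
    namespace depthwise {
    // kernel implementation
    } // namespace depthwise
    } // namespace arm_conv

    #endif // defined(__aarch64__)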
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index a6dba90f9e..d69f391514 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -40,169 +41,169 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q14, [%x[params], #0x0]\n"
+ "ldr q11, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v15.16b, #0x1\n"
- "ushr v15.4s, v15.4s, #0x8\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr x20, [%x[inptrs], #0x8]\n"
"ld1 { v1.16b }, [x20]\n"
- "mov v29.16b, v1.16b\n"
- "mov v16.16b, v1.16b\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1 { v2.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v22.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x20]\n"
"ld1 { v4.16b }, [x20]\n"
- "mov v31.16b, v2.16b\n"
- "mov v30.16b, v2.16b\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1 { v0.16b }, [x20]\n"
- "mov v23.16b, v4.16b\n"
- "mov v21.16b, v4.16b\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1 { v3.16b }, [x20]\n"
- "mov v20.16b, v4.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x2\n"
- "ext v16.16b, v16.16b, v16.16b, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x2\n"
- "ext v31.16b, v31.16b, v31.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"ld1r { v12.4s }, [x20]\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
- "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v10.4s }, [x20]\n"
- "mov v25.16b, v0.16b\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
"mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
"mov v18.16b, v0.16b\n"
- "mov v24.16b, v3.16b\n"
+ "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"mov v17.16b, v3.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
"ext v19.16b, v19.16b, v19.16b, #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v1.4s, v1.4s, v16.4s\n"
- "mov v16.16b, v3.16b\n"
- "zip1 v29.4s, v29.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v31.4s\n"
- "zip1 v22.4s, v22.4s, v30.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x2\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v21.4s\n"
- "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
"zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v25.4s, v25.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v29.4s\n"
- "zip1 v2.4s, v2.4s, v22.4s\n"
- ".inst 0x6f81e1fa // udot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
"zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v24.4s, v24.4s, v16.4s\n"
- ".inst 0x6fa1e1fb // udot v27.4s, v15.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v23.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
"movi v23.4s, #0x0\n"
- ".inst 0x6f81e9f7 // udot v23.4s, v15.16b, v1.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
"movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
- ".inst 0x6fa1e9f6 // udot v22.4s, v15.16b, v1.4b[3]\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x6f82e1f5 // udot v21.4s, v15.16b, v2.4b[0]\n"
- "movi v8.4s, #0x0\n"
+ ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6fa2e1f4 // udot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
"movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
- ".inst 0x6f82e9e9 // udot v9.4s, v15.16b, v2.4b[2]\n"
+ ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v25.4s\n"
- ".inst 0x6fa2e9e8 // udot v8.4s, v15.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v24.4s\n"
- ".inst 0x6f84e1f3 // udot v19.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x6fa4e1f2 // udot v18.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6f84e9f1 // udot v17.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x6fa4e9f0 // udot v16.4s, v15.16b, v4.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
"movi v30.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x6f80e1ff // udot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
"movi v28.4s, #0x0\n"
- ".inst 0x6fa0e1fe // udot v30.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6f80e9fd // udot v29.4s, v15.16b, v0.4b[2]\n"
- ".inst 0x6fa0e9fc // udot v28.4s, v15.16b, v0.4b[3]\n"
- "add v24.4s, v26.4s, v21.4s\n"
- "add v25.4s, v27.4s, v20.4s\n"
- "add v26.4s, v23.4s, v9.4s\n"
- "add v27.4s, v22.4s, v8.4s\n"
- "add v23.4s, v19.4s, v21.4s\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x6f83e1f6 // udot v22.4s, v15.16b, v3.4b[0]\n"
- "add v21.4s, v18.4s, v20.4s\n"
+ ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
"movi v20.4s, #0x0\n"
- ".inst 0x6fa3e1f4 // udot v20.4s, v15.16b, v3.4b[1]\n"
- "add v19.4s, v17.4s, v9.4s\n"
+ ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f83e9f2 // udot v18.4s, v15.16b, v3.4b[2]\n"
- "add v17.4s, v16.4s, v8.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6fa3e9f0 // udot v16.4s, v15.16b, v3.4b[3]\n"
+ ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
"add v24.4s, v24.4s, v31.4s\n"
"add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v27.4s, v27.4s, v28.4s\n"
- "add v28.4s, v23.4s, v22.4s\n"
- "add v29.4s, v21.4s, v20.4s\n"
- "add v30.4s, v19.4s, v18.4s\n"
- "add v31.4s, v17.4s, v16.4s\n"
- "neg v13.4s, v13.4s\n"
- "mul v24.4s, v24.4s, v13.4s\n"
- "mul v25.4s, v25.4s, v13.4s\n"
- "mul v26.4s, v26.4s, v13.4s\n"
- "mul v27.4s, v27.4s, v13.4s\n"
- "mul v28.4s, v28.4s, v13.4s\n"
- "mul v29.4s, v29.4s, v13.4s\n"
- "mul v30.4s, v30.4s, v13.4s\n"
- "mul v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
- "ldr q14, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x20]\n"
".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -219,43 +220,43 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr q5, [%x[params], #0x30]\n"
".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
+ "and v19.16b, v24.16b, v21.16b\n"
".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
- "and v18.16b, v25.16b, v20.16b\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "and v16.16b, v27.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
@@ -264,38 +265,38 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -329,14 +330,14 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x0]\n"
@@ -415,30 +416,30 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
- "smin v24.4s, v24.4s, v10.4s\n"
- "smin v25.4s, v25.4s, v10.4s\n"
- "smin v26.4s, v26.4s, v10.4s\n"
- "smin v27.4s, v27.4s, v10.4s\n"
- "smin v28.4s, v28.4s, v10.4s\n"
- "smin v29.4s, v29.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v10.4s\n"
- "smin v31.4s, v31.4s, v10.4s\n"
- "smax v24.4s, v24.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v11.4s\n"
- "smax v27.4s, v27.4s, v11.4s\n"
- "smax v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v11.4s\n"
- "smax v31.4s, v31.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -514,4 +515,5 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
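Worth noting in the prologue above: the input sums used for zero-point correction are built with udot against a constant register. movi fills every byte with 0x01 and the ushr by 8 on .4s lanes clears the top byte, leaving 0x00010101 per lane, so each udot adds three of the four bytes in an input group (three taps per row of the 3x3 kernel). The sums are then multiplied by the negated b_offset and folded into the bias-initialised accumulators. A scalar sketch under that reading, with illustrative names:

    #include <cstdint>

    // Accumulators appear to start as bias - b_offset * sum(x), letting the
    // main loop accumulate plain sum(x * w) with no per-tap offset handling.
    static int32_t initial_acc(const uint8_t *x, int taps, int32_t bias, int32_t b_offset)
    {
        int32_t sum = 0;
        for (int i = 0; i < taps; ++i)
            sum += x[i];               // the udot-with-ones reductions
        return bias - b_offset * sum;  // neg + mul, then the add of the bias vector
    }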
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index b575a5d169..4485aaa735 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -57,4 +57,5 @@ struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst :
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 027cc9e5a2..61cec2b66d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -42,133 +43,133 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
__asm__ __volatile__(
"ldr q12, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v28.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v31.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v30.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "mov v16.16b, v3.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v4.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
- "mov v15.16b, v4.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x8]\n"
- "mov v20.16b, v2.16b\n"
- "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
"ld1 { v1.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v16.2d\n"
- "zip1 v4.2d, v4.2d, v15.2d\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
"ld1 { v5.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x30]\n"
"mov v26.16b, v1.16b\n"
- "mov v13.16b, v5.16b\n"
+ "mov v22.16b, v5.16b\n"
"ld1 { v6.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x38]\n"
"mov v19.16b, v6.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
"ld1 { v7.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x0]\n"
- "mov v17.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v20.2d\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
"ld1 { v0.16b }, [x20]\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
"ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6f83e392 // udot v18.4s, v28.16b, v3.4b[0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- ".inst 0x6f83eb9f // udot v31.4s, v28.16b, v3.4b[2]\n"
- ".inst 0x6f84e398 // udot v24.4s, v28.16b, v4.4b[0]\n"
+ ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v23.4s }, [x20]\n"
- ".inst 0x6f84eb9e // udot v30.4s, v28.16b, v4.4b[2]\n"
- "mov v16.16b, v0.16b\n"
- ".inst 0x6f82e395 // udot v21.4s, v28.16b, v2.4b[0]\n"
- "movi v20.4s, #0x0\n"
- "movi v29.4s, #0x1\n"
- ".inst 0x6f82eb94 // udot v20.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
"zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x6fa3e3b2 // udot v18.4s, v29.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v13.2d\n"
+ ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
"zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x6fa3ebbf // udot v31.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v17.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
"movi v22.4s, #0x0\n"
- ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x6fa4ebbe // udot v30.4s, v29.16b, v4.4b[3]\n"
- ".inst 0x6f81e396 // udot v22.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
"ld1r { v15.4s }, [x20]\n"
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x6f81eb9a // udot v26.4s, v28.16b, v1.4b[2]\n"
- "zip1 v0.2d, v0.2d, v16.2d\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6f85e399 // udot v25.4s, v28.16b, v5.4b[0]\n"
+ ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6f85eb9b // udot v27.4s, v28.16b, v5.4b[2]\n"
- ".inst 0x6f86e393 // udot v19.4s, v28.16b, v6.4b[0]\n"
- "add v24.4s, v18.4s, v24.4s\n"
- "mov x9, #0x0\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f86eb92 // udot v18.4s, v28.16b, v6.4b[2]\n"
- ".inst 0x6fa2e3b5 // udot v21.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- ".inst 0x6fa2ebb4 // udot v20.4s, v29.16b, v2.4b[3]\n"
- "add v17.4s, v31.4s, v30.4s\n"
- ".inst 0x6fa1e3b6 // udot v22.4s, v29.16b, v1.4b[1]\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f87e390 // udot v16.4s, v28.16b, v7.4b[0]\n"
- ".inst 0x6fa1ebba // udot v26.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- ".inst 0x6fa5e3b9 // udot v25.4s, v29.16b, v5.4b[1]\n"
- ".inst 0x6fa5ebbb // udot v27.4s, v29.16b, v5.4b[3]\n"
- "add v30.4s, v21.4s, v24.4s\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x6fa6e3b3 // udot v19.4s, v29.16b, v6.4b[1]\n"
- ".inst 0x6fa6ebb2 // udot v18.4s, v29.16b, v6.4b[3]\n"
- "add v31.4s, v20.4s, v17.4s\n"
+ ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x6fa7e3b0 // udot v16.4s, v29.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v30.4s\n"
+ ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- "add v21.4s, v26.4s, v31.4s\n"
- "add v20.4s, v25.4s, v19.4s\n"
- "add v19.4s, v27.4s, v18.4s\n"
- "add v18.4s, v16.4s, v24.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f87eb90 // udot v16.4s, v28.16b, v7.4b[2]\n"
- ".inst 0x6fa7ebb0 // udot v16.4s, v29.16b, v7.4b[3]\n"
- "add v17.4s, v16.4s, v17.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
- ".inst 0x6fa0e3b0 // udot v16.4s, v29.16b, v0.4b[1]\n"
- "add v24.4s, v22.4s, v16.4s\n"
- "add v26.4s, v22.4s, v25.4s\n"
- "movi v16.4s, #0x0\n"
- ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
- ".inst 0x6fa0ebb0 // udot v16.4s, v29.16b, v0.4b[3]\n"
- "add v25.4s, v21.4s, v16.4s\n"
- "add v27.4s, v21.4s, v27.4s\n"
- "add v28.4s, v20.4s, v30.4s\n"
- "add v29.4s, v19.4s, v31.4s\n"
- "add v30.4s, v18.4s, v20.4s\n"
- "add v31.4s, v17.4s, v19.4s\n"
+ ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
"neg v23.4s, v23.4s\n"
"mul v24.4s, v24.4s, v23.4s\n"
"mul v25.4s, v25.4s, v23.4s\n"
@@ -194,11 +195,11 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- "ldr q21, [%x[params], #0x60]\n"
- "ldr q20, [%x[params], #0x70]\n"
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
- "ldr q12, [%x[params], #0x80]\n"
+ "ldr q20, [%x[params], #0x80]\n"
".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
@@ -212,7 +213,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
@@ -221,7 +222,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
@@ -230,115 +231,115 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
- ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "and v18.16b, v25.16b, v20.16b\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v19.4s\n"
- "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v30.16b, v20.16b\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v19.4s\n"
- "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
"sqadd v30.4s, v30.4s, v17.4s\n"
"sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v27.4s, v27.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -347,14 +348,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -388,14 +389,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"dup v30.4s, v23.s[2]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
"bgt 1b\n"
"2:" // Tail
"ldr q21, [%x[params], #0x60]\n"
@@ -420,7 +421,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"add x21, x21, x28\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
@@ -430,7 +431,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
- "ldr q9, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x10]\n"
".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
@@ -439,68 +440,68 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
- "ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
- "ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
- ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
- ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
- "ldr q9, [%x[params], #0x50]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
- ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
- ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
"sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
"sqrdmulh v25.4s, v25.4s, v21.4s\n"
- ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
"sqrdmulh v26.4s, v26.4s, v21.4s\n"
- ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
"sqrdmulh v27.4s, v27.4s, v21.4s\n"
- ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
"and v19.16b, v24.16b, v20.16b\n"
- ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
@@ -536,14 +537,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"srshl v29.4s, v29.4s, v20.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
"smin v24.4s, v24.4s, v15.4s\n"
"smin v25.4s, v25.4s, v15.4s\n"
"smin v26.4s, v26.4s, v15.4s\n"
@@ -552,14 +553,14 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"smin v29.4s, v29.4s, v15.4s\n"
"smin v30.4s, v30.4s, v15.4s\n"
"smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -635,4 +636,5 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
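The kernel above accumulates with `udot` (a 4-way uint8 dot product into int32 lanes) and then runs the usual fixed-point requantization epilogue: `sshl` by the per-layer left shift, `sqrdmulh` by the per-layer multiplier, a rounding right shift, addition of the output offset, and an `smin`/`smax` clamp before `uzp1` narrows back to bytes. The register renaming in this patch does not change that math. Below is a hypothetical scalar model of one lane of the epilogue; the parameter names mirror the `Requantize32` fields referenced through `%[qp]` and are assumptions for illustration, not the library's own helper.

```cpp
#include <algorithm>
#include <cstdint>

// Scalar sketch of one lane of the requantization epilogue, assuming modest
// shift amounts so the 64-bit intermediates below cannot overflow.
static uint8_t requantize_lane(int32_t acc,
                               int32_t left_shift,   // per_layer_left_shift
                               int32_t mul,          // per_layer_mul
                               int32_t right_shift,  // per_layer_right_shift, >= 0
                               int32_t c_offset,     // output zero point
                               int32_t minval,
                               int32_t maxval)
{
    // sshl: pre-scale the accumulator by the per-layer left shift.
    const int64_t x = static_cast<int64_t>(acc) << left_shift;
    // sqrdmulh: rounding doubling multiply-high, i.e. sat((x * mul + 2^30) >> 31).
    const int64_t h = (x * mul + (int64_t(1) << 30)) >> 31;
    int32_t y = static_cast<int32_t>(std::clamp<int64_t>(h, INT32_MIN, INT32_MAX));
    // srshl by a negative amount: rounding arithmetic shift right. The and /
    // sshr / sqadd trio in the assembly adds a sign-dependent nudge so negative
    // values round symmetrically; this model keeps plain half-up rounding.
    if (right_shift > 0)
        y = (y + (1 << (right_shift - 1))) >> right_shift;
    // add + smin + smax: apply the output offset and clamp to the quantized
    // range; uzp1 then keeps the low byte of each clamped lane.
    y = std::clamp(y + c_offset, minval, maxval);
    return static_cast<uint8_t>(y);
}
```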
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 13f903b95d..1f2d211be2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
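Alongside the register renumbering, the patch flattens include paths (headers are now resolved relative to the kernel tree, as in the `utils.hpp` change above) and moves the `__aarch64__` guard below the includes, as the next file's first hunk shows. A minimal sketch of the guard layout the patch converges on:

```cpp
// Includes stay outside the architecture guard so the translation unit
// remains well-formed on every target; only the kernel body is fenced off.
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {
// ... kernel implementation ...
} // namespace depthwise
} // namespace arm_conv

#endif // defined(__aarch64__)
```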
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index bbb817a883..0770c126ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
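(Editorial note, not part of the patch: the long store/requantize sequence above — sshl, sqrdmulh, the and/sshr/sqadd rounding fixup, srshl, offset add, smin/smax clamp, uzp1 narrow — implements a standard fixed-point requantization. Below is a minimal scalar C++ sketch of that sequence for orientation; all names are illustrative, not ComputeLibrary API, and it assumes the left-shifted value stays in 32-bit range as the vector code does.)

```cpp
#include <cstdint>

// Hedged scalar model of the per-channel requantize path in the assembly
// above. The assembly keeps the right shift as a negative srshl amount;
// here it is taken as a positive count for readability.
static inline uint8_t requantize_u8(int32_t acc, int32_t mul,
                                    int32_t left_shift, int32_t right_shift,
                                    int32_t c_offset, int32_t minval,
                                    int32_t maxval)
{
    // sshl: apply the optional per-layer/per-channel left shift first.
    int64_t v = static_cast<int64_t>(acc) << left_shift;

    // sqrdmulh: rounding doubling multiply, keeping the high half
    // ((2*a*b + 2^31) >> 32, i.e. (a*b + 2^30) >> 31); saturating in hardware.
    v = (v * static_cast<int64_t>(mul) + (1LL << 30)) >> 31;

    // srshl by a negative amount: rounding arithmetic right shift. The
    // and/sshr/sqadd triple in the assembly plays this rounding-correction role.
    if (right_shift > 0)
        v = (v + (1LL << (right_shift - 1))) >> right_shift;

    // add the output (c) offset, then clamp to the quantized output range.
    v += c_offset;
    if (v < minval) v = minval;
    if (v > maxval) v = maxval;
    return static_cast<uint8_t>(v);  // uzp1 narrows vector lanes to bytes.
}
```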
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 55731060f4..20a37b157f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,16 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const
-);
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
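(Editorial note, not part of the patch: the generic.cpp kernel diffed next accumulates with the widening usubl/smlal pattern — u8 inputs widened to s16 with the quantization offset removed, then multiply-accumulated into s32 lanes. A hedged NEON-intrinsics sketch of one such step follows; the function and parameter names are illustrative only.)

```cpp
#include <arm_neon.h>
#include <cstdint>

// One widening multiply-accumulate step, as in the "usubl ... smlal ..."
// pairs throughout the kernel. The u8q multiplier kernels above use the
// per-lane variant (vmlal_lane_s16) instead of the full-vector form.
static inline int32x4_t mac_step(int32x4_t acc,
                                 uint8x8_t input, uint8x8_t in_offset,
                                 int16x4_t weights)
{
    // usubl: widen u8 -> s16 while subtracting the input quantization offset.
    int16x8_t x = vreinterpretq_s16_u16(vsubl_u8(input, in_offset));

    // smlal: widening multiply-accumulate of the low s16 half into s32 lanes.
    return vmlal_s16(acc, weights, vget_low_s16(x));
}
```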
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index afc6695ff1..d1872c90f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1070 +91,1070 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
+ "lsr x15, x16, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v18.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v20.8h }, [x21]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v5.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "mov x17, #0x0\n"
- "ld1r { v13.8h }, [x20]\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "ldr q14, [x27, #0x0]\n"
- "ldr q11, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "str x27, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d31, [x24, x17]\n"
- "ldr d30, [x23, x17]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d29, [x22, x17]\n"
- "ldr d28, [x21, x17]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr x20, [x15, #0x20]\n"
- "ldr d27, [x20, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x13, #0x0\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x28, x27, [x22, #0x0]\n"
+ "ldp x26, x25, [x22, #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "subs x15, x15, #0x1\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d23, [x23, x14]\n"
+ "ldr d10, [x22, x14]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d11, [x21, x14]\n"
+ "ldr d13, [x20, x14]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr x20, [x12, #0x20]\n"
+ "ldr d27, [x20, x14]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q22, [x13, #0x0]\n"
- "ldr q10, [x12, #0x0]\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr q18, [x13, #0x10]\n"
- "ldr q26, [x12, #0x10]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr d29, [x20, x17]\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "ldr x26, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr d30, [x20, x17]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x17]\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr d31, [x26, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x48\n"
- "subs x8, x8, #0x1\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x17]\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "add x17, x17, #0x8\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q26, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x24, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v10.4h, v7.4h\n"
+ "ldr x23, [x12, #0x68]\n"
+ "ldr x22, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x21, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x21, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v15.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v9.4s, v10.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x11, x11, #0x48\n"
+ "subs x15, x15, #0x1\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x24, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v15.8h, v4.8h\n"
+ "ldr d15, [x23, x14]\n"
+ "smlal v3.4s, v10.4h, v19.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v10.8h, v19.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x21, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v15.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v17.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v15.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smlal v0.4s, v10.4h, v16.4h\n"
+ "smlal v6.4s, v10.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "smlal2 v22.4s, v10.8h, v16.8h\n"
+ "smlal2 v2.4s, v10.8h, v29.8h\n"
+ "and v23.16b, v9.16b, v26.16b\n"
+ "smlal v3.4s, v15.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v24.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v24.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v24.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v8.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "and v11.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v29.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqadd v9.4s, v9.4s, v23.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v13.16b, v30.16b, v26.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v22.16b, v26.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v23.16b, v2.16b, v26.16b\n"
+ "sqadd v3.4s, v3.4s, v8.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
+ "sqadd v6.4s, v6.4s, v29.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v13.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v23.4s\n"
"srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d14, [x11, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d16, [x10, x16]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x9, x16]\n"
- "str d23, [x28, x16]\n"
- "ldr q14, [x27, #0x0]\n"
- "ldr q11, [x27, #0x10]\n"
- "add x27, x27, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
- "str x27, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr d31, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d30, [x23, x17]\n"
- "ldr d29, [x22, x17]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "ldr d28, [x21, x17]\n"
- "ldr x20, [x15, #0x20]\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "ldr d27, [x20, x17]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "add x13, x13, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr d23, [x23, x14]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d10, [x22, x14]\n"
+ "ldr d11, [x21, x14]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d13, [x20, x14]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d27, [x20, x14]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q22, [x13, #0x0]\n"
- "ldr q10, [x12, #0x0]\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr q18, [x13, #0x10]\n"
- "ldr q26, [x12, #0x10]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr d29, [x20, x17]\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "ldr x26, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr d30, [x20, x17]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x17]\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "tst x7, #0x7\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr d31, [x26, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x17]\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "add x17, x17, #0x8\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q24, [x10, #0x10]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x23, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v15.4h, v7.4h\n"
+ "ldr x22, [x12, #0x68]\n"
+ "ldr x21, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x24, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v10.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "tst x16, #0x7\n"
+ "smlal2 v9.4s, v15.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x23, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v10.8h, v4.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal v3.4s, v15.4h, v19.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v15.8h, v19.8h\n"
+ "ldr d15, [x21, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x20, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v10.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v26.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v1.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v10.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v0.4s, v15.4h, v16.4h\n"
+ "smlal v6.4s, v15.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v1.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v24.4s\n"
+ "smlal2 v22.4s, v15.8h, v16.8h\n"
+ "smlal2 v2.4s, v15.8h, v29.8h\n"
+ "and v27.16b, v9.16b, v20.16b\n"
+ "smlal v3.4s, v10.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v10.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v26.4s\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v24.4s\n"
+ "and v4.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "and v17.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v24.4s\n"
"sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d14, [x11, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d16, [x10, x16]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x9, x16]\n"
- "str d23, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v8.16b, v30.16b, v20.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v26.16b, v22.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v11.16b, v2.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v4.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v26.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "add x13, x13, #0x8\n"
"beq 64f\n"
- "add x14, x14, #0x48\n"
+ "add x11, x11, #0x48\n"
"3:" // Oddments
- "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v14.4s }, [x27], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v11.d }[0], [x27], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v11.s }[2], [x27]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v11.s }[0], [x27]\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v14.d }[0], [x27], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v14.s }[2], [x27]\n"
+ "tbz x16, #1, 6f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v14.s }[0], [x27]\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v16.16b, v14.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v24.16b, v14.16b\n"
- "mov v17.16b, v11.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v23.16b, v14.16b\n"
- "mov v25.16b, v11.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x24, x23, [x12, #0x0]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldp x22, x21, [x12, #0x10]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
"ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x16, #1, 8f\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
"ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x23]\n"
+ "ld1 { v11.b }[6], [x22]\n"
+ "ld1 { v13.b }[6], [x21]\n"
"ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x23]\n"
+ "ld1 { v11.b }[4], [x22]\n"
+ "ld1 { v13.b }[4], [x21]\n"
"ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x16, #1, 10f\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
"ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x23]\n"
+ "ld1 { v11.b }[2], [x22]\n"
+ "ld1 { v13.b }[2], [x21]\n"
"ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x23]\n"
+ "ld1 { v11.b }[0], [x22]\n"
+ "ld1 { v13.b }[0], [x21]\n"
"ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v4.4h\n"
- "smlal2 v11.4s, v31.8h, v4.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "add x22, x22, x17\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v1.4h\n"
- "smlal2 v17.4s, v31.8h, v1.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v25.4s, v31.8h, v0.8h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v11.4s, v30.8h, v0.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v16.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v17.4s, v28.8h, v2.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v25.4s, v28.8h, v1.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "tbz x16, #2, 13f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 12f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x16, #1, 14f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v24.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "smlal v14.4s, v27.4h, v7.4h\n"
- "smlal2 v11.4s, v27.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "smlal v16.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v24.4s, v27.4h, v4.4h\n"
- "smlal2 v17.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v25.4s, v27.8h, v3.8h\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v0.4s, v26.4h, v29.4h\n"
+ "smlal2 v22.4s, v26.8h, v29.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "tbz x16, #2, 17f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 16f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x16, #1, 18f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x21, [x15, #0x38]\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v25.4s, v29.8h, v8.8h\n"
- "add x21, x21, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x38]\n"
+ "smlal v6.4s, v23.4h, v4.4h\n"
+ "smlal2 v2.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 21f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 20f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "tbz x16, #1, 22f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x26, [x15, #0x40]\n"
- "smlal v14.4s, v28.4h, v1.4h\n"
- "smlal2 v11.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "add x26, x26, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[6], [x26]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "smlal v28.4s, v21.4h, v7.4h\n"
+ "smlal2 v9.4s, v21.8h, v7.8h\n"
+ "smlal v3.4s, v21.4h, v19.4h\n"
+ "smlal2 v30.4s, v21.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 25f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 24f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[4], [x26]\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[2], [x26]\n"
+ "tbz x16, #1, 26f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v31.b }[0], [x26]\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v11.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x12, #0x48]\n"
+ "smlal v28.4s, v18.4h, v1.4h\n"
+ "smlal2 v9.4s, v18.8h, v1.8h\n"
+ "smlal v3.4s, v18.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "tbz x16, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x25, [x15, #0x50]\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v16.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "add x25, x25, x17\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal2 v17.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v25.4s, v30.8h, v4.8h\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x12, #0x50]\n"
+ "smlal v28.4s, v15.4h, v4.4h\n"
+ "smlal2 v9.4s, v15.8h, v4.8h\n"
+ "smlal v3.4s, v15.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v0.4s, v15.4h, v31.4h\n"
+ "smlal2 v22.4s, v15.8h, v31.8h\n"
+ "smlal v6.4s, v15.4h, v8.4h\n"
+ "smlal2 v2.4s, v15.8h, v8.8h\n"
+ "tbz x16, #2, 33f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 32f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x16, #1, 34f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x58]\n"
- "smlal v14.4s, v29.4h, v3.4h\n"
- "smlal2 v11.4s, v29.8h, v3.8h\n"
- "smlal v24.4s, v29.4h, v0.4h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v28.4s, v20.4h, v17.4h\n"
+ "smlal2 v9.4s, v20.8h, v17.8h\n"
+ "smlal v0.4s, v20.4h, v19.4h\n"
+ "smlal2 v22.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 37f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 36f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x16, #1, 38f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x23, [x15, #0x60]\n"
- "smlal v16.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v25.4s, v28.8h, v2.8h\n"
- "add x23, x23, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v3.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "smlal v6.4s, v11.4h, v1.4h\n"
+ "smlal2 v2.4s, v11.8h, v1.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 41f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 40f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x16, #1, 42f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x22, [x15, #0x68]\n"
- "smlal v14.4s, v31.4h, v6.4h\n"
- "smlal2 v11.4s, v31.8h, v6.8h\n"
- "smlal v24.4s, v31.4h, v3.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "add x22, x22, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x68]\n"
+ "smlal v28.4s, v23.4h, v29.4h\n"
+ "smlal2 v9.4s, v23.8h, v29.8h\n"
+ "smlal v0.4s, v23.4h, v17.4h\n"
+ "smlal2 v22.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 45f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 44f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x16, #1, 46f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v16.4s, v30.4h, v8.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v25.4s, v30.8h, v5.8h\n"
- "add x21, x21, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v3.4s, v20.4h, v4.4h\n"
+ "smlal2 v30.4s, v20.8h, v4.8h\n"
+ "smlal v6.4s, v20.4h, v31.4h\n"
+ "smlal2 v2.4s, v20.8h, v31.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 49f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 48f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x16, #1, 50f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v24.4s, v29.4h, v7.4h\n"
- "smlal2 v17.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v25.4s, v29.8h, v6.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "smlal v6.4s, v8.4h, v29.4h\n"
+ "smlal2 v2.4s, v8.8h, v29.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 53f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 52f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x16, #1, 54f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v24.4s, v28.4h, v8.4h\n"
- "smlal2 v17.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v25.4s, v28.8h, v7.8h\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "ld1 { v10.4s }, [x12], #0x10\n"
- "tbz x7, #1, 56f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v26.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x12]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "tbz x16, #2, 57f\n"
+ "ld1 { v7.4s }, [x10], #0x10\n"
+ "ld1 { v23.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 56f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v27.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[2], [x10]\n"
+ "ld1 { v27.s }[2], [x9]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v26.s }[0], [x12]\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[0], [x10]\n"
+ "ld1 { v27.s }[0], [x9]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "ld1 { v10.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v22.s }[2], [x13]\n"
- "ld1 { v10.s }[2], [x12]\n"
+ "tbz x16, #1, 58f\n"
+ "ld1 { v7.d }[0], [x10], #0x8\n"
+ "ld1 { v23.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[2], [x10]\n"
+ "ld1 { v23.s }[2], [x9]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v22.s }[0], [x13]\n"
- "ld1 { v10.s }[0], [x12]\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[0], [x10]\n"
+ "ld1 { v23.s }[0], [x9]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v14.4s, v14.4s, v22.4s\n"
- "and v21.16b, v14.16b, v10.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v4.16b, v11.16b, v26.16b\n"
- "sqrdmulh v16.4s, v16.4s, v22.4s\n"
- "sqrdmulh v24.4s, v24.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v7.4s\n"
+ "and v20.16b, v28.16b, v23.16b\n"
+ "add x28, x28, x13\n"
+ "add x27, x27, x13\n"
+ "sqrdmulh v9.4s, v9.4s, v11.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "and v4.16b, v9.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v7.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v7.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v20.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v16.16b, v10.16b\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "and v3.16b, v24.16b, v10.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "and v21.16b, v23.16b, v10.16b\n"
- "sqrdmulh v25.4s, v25.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v19.16b, v3.16b, v23.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v11.4s\n"
+ "and v29.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v11.4s\n"
+ "and v26.16b, v6.16b, v23.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v11.4s\n"
+ "sqadd v9.4s, v9.4s, v4.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "and v27.16b, v9.16b, v26.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v5.16b, v17.16b, v26.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v4.16b, v25.16b, v26.16b\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v3.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v14.4s, v14.4s, v10.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v26.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v17.4s, v17.4s, v26.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v25.4s, v25.4s, v26.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v14.8h, v11.4s\n"
- "sqxtn2 v16.8h, v9.4s\n"
- "sqxtn2 v24.8h, v17.4s\n"
- "sqxtn2 v23.8h, v25.4s\n"
- "sqadd v14.8h, v14.8h, v20.8h\n"
- "sqadd v16.8h, v16.8h, v20.8h\n"
- "sqadd v24.8h, v24.8h, v20.8h\n"
- "sqadd v23.8h, v23.8h, v20.8h\n"
- "smax v14.8h, v14.8h, v15.8h\n"
- "smax v16.8h, v16.8h, v15.8h\n"
- "smax v24.8h, v24.8h, v15.8h\n"
- "smax v23.8h, v23.8h, v15.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v16.8h, v16.8h, v13.8h\n"
- "smin v24.8h, v24.8h, v13.8h\n"
- "smin v23.8h, v23.8h, v13.8h\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x7, #2, 61f\n"
- "st1 { v14.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v24.s }[0], [x9], #0x4\n"
- "st1 { v23.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 60f\n"
- "st1 { v14.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v24.h }[2], [x9], #0x2\n"
- "st1 { v23.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v24.b }[6], [x9], #0x1\n"
- "st1 { v23.b }[6], [x28], #0x1\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v8.16b, v22.16b, v27.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v13.16b, v2.16b, v27.16b\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v23.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v23.4s\n"
+ "sqadd v2.4s, v2.4s, v13.4s\n"
+ "srshl v9.4s, v9.4s, v27.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v27.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x16, #2, 61f\n"
+ "st1 { v28.s }[0], [x28], #0x4\n"
+ "st1 { v3.s }[0], [x27], #0x4\n"
+ "st1 { v0.s }[0], [x26], #0x4\n"
+ "st1 { v6.s }[0], [x25], #0x4\n"
+ "tbz x16, #1, 60f\n"
+ "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v3.h }[2], [x27], #0x2\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v6.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v3.b }[6], [x27], #0x1\n"
+ "st1 { v0.b }[6], [x26], #0x1\n"
+ "st1 { v6.b }[6], [x25], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v24.b }[4], [x9], #0x1\n"
- "st1 { v23.b }[4], [x28], #0x1\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[4], [x28], #0x1\n"
+ "st1 { v3.b }[4], [x27], #0x1\n"
+ "st1 { v0.b }[4], [x26], #0x1\n"
+ "st1 { v6.b }[4], [x25], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "st1 { v14.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v24.h }[0], [x9], #0x2\n"
- "st1 { v23.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v24.b }[2], [x9], #0x1\n"
- "st1 { v23.b }[2], [x28], #0x1\n"
+ "tbz x16, #1, 62f\n"
+ "st1 { v28.h }[0], [x28], #0x2\n"
+ "st1 { v3.h }[0], [x27], #0x2\n"
+ "st1 { v0.h }[0], [x26], #0x2\n"
+ "st1 { v6.h }[0], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v3.b }[2], [x27], #0x1\n"
+ "st1 { v0.b }[2], [x26], #0x1\n"
+ "st1 { v6.b }[2], [x25], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v14.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v24.b }[0], [x9], #0x1\n"
- "st1 { v23.b }[0], [x28], #0x1\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[0], [x28], #0x1\n"
+ "st1 { v3.b }[0], [x27], #0x1\n"
+ "st1 { v0.b }[0], [x26], #0x1\n"
+ "st1 { v6.b }[0], [x25], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
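The hunk above regenerates the kernel's main loop and oddments tail with a new register allocation: the accumulator pairs move (v14/v11, v16/v9, v24/v17, v23/v25 become v28/v9, v3/v30, v0/v22, v6/v2), the weight/input pointer bases shift down (roughly x14–x17 become x11–x14), and the loads are interleaved differently, but the arithmetic is unchanged. Each output lane still follows the Requantize32 pipeline visible in the sqrdmulh/sshr/sqadd/srshl/sqxtn/smax/smin tail. A minimal scalar sketch of one lane follows; the helper name and scalar form are illustrative only (the kernel processes four int32 lanes per vector and narrows through 16 bits before adding the offset):

#include <algorithm>
#include <cstdint>

// Scalar model of the per-lane requantization performed by the assembly
// above.  'mul' comes from requant_muls, 'shift' from requant_shifts
// (stored non-positive), and c_offset/minval/maxval from the
// arm_gemm::Requantize32 parameters.
static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling multiply, keeping the high half.
    int64_t p = (int64_t)acc * (int64_t)mul;
    int64_t v = std::min<int64_t>((p + (1LL << 30)) >> 31, INT32_MAX);

    // and/sshr/sqadd then srshl: rounding arithmetic shift right by -shift,
    // with negative values nudged down one first so they round the same way
    // as the reference requantization (the 'and' with the shift vector
    // disables the fixup when the shift is zero).
    int n = -shift;
    if (n > 0)
    {
        if (v < 0) v--;
        v = (v + (1LL << (n - 1))) >> n;
    }

    // sqxtn/sqadd/smax/smin/uzp1: add the output offset, clamp, narrow to 8 bits.
    v += c_offset;
    v = std::max<int64_t>(v, minval);
    v = std::min<int64_t>(v, maxval);
    return (uint8_t)v;
}

The "Oddments" ladders after the main loop reuse the same arithmetic; the tbz tests on bits 2, 1 and 0 of the remaining channel count (x16 in the new allocation) load and store 4-, 2- and 1-byte tails so the kernel never reads or writes past the end of a row.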
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index b27e8687e0..50778e9cbb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
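The collapsed prototype above is equivalent to the multi-line declaration it replaces: top-level const on a by-value parameter is not part of the function type, so both spellings declare the same symbol. A standalone illustration (hypothetical names, not from the patch):

#include <cstdint>
#include <type_traits>

void f(const unsigned int, const int32_t *const);  // old spelling
void f(unsigned int, const int32_t *);             // redeclares the same function

static_assert(std::is_same_v<void (const unsigned int, const int32_t *const),
                             void (unsigned int, const int32_t *)>,
              "top-level cv-qualifiers are dropped when forming a function type");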
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index a1e5c669b7..c807cb3ade 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,15 +104,15 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v19.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v22.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
"mov x17, #0x0\n"
- "ld1r { v23.8h }, [x20]\n"
+ "ld1r { v5.8h }, [x20]\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
@@ -121,563 +121,563 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
- "ldr q15, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "str x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d26, [x27, x17]\n"
+ "ldr d18, [x26, x17]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d10, [x25, x17]\n"
+ "ldr d27, [x24, x17]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d17, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d15, [x21, x17]\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x13, #0x0]\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x22, [x15, #0x48]\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "ldr q21, [x12, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr q31, [x13, #0x0]\n"
+ "ldr q0, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
"ldr d26, [x20, x17]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x23, x17]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
"ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x21, [x15, #0xb8]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"add x14, x14, #0x48\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "and v2.16b, v15.16b, v21.16b\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x17, x17, #0x8\n"
"subs x8, x8, #0x1\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
"add x13, x13, #0x20\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add x17, x17, #0x8\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "and v16.16b, v13.16b, v31.16b\n"
"add x12, x12, #0x20\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v10.16b, v8.16b, v0.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v12.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v31.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v31.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v0.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v0.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v0.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v12.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d17, [x10, x16]\n"
- "str d11, [x9, x16]\n"
- "str d9, [x28, x16]\n"
- "ldr q15, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x22, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d30, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr d24, [x20, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x27, x17]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldr d18, [x26, x17]\n"
+ "ldr d10, [x25, x17]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "ldr d27, [x24, x17]\n"
+ "ldr d17, [x23, x17]\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d19, [x22, x17]\n"
+ "ldr d15, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x13, #0x0]\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x22, [x15, #0x48]\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "ldr q21, [x12, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x17]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr q0, [x13, #0x0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
"ldr d26, [x20, x17]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x23, x17]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
"ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x21, [x15, #0xb8]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"ldr x20, [x15, #0xc0]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
"tst x7, #0x7\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x20, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x17, x17, #0x8\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
"add x13, x13, #0x20\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "and v2.16b, v15.16b, v21.16b\n"
"add x12, x12, #0x20\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add x17, x17, #0x8\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "and v16.16b, v13.16b, v31.16b\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v0.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v23.16b, v8.16b, v31.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v7.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v0.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v0.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v31.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v31.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v7.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
"srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d17, [x10, x16]\n"
- "str d11, [x9, x16]\n"
- "str d9, [x28, x16]\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v13.s }[2], [x22]\n"
+ "ld1 { v2.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v13.s }[0], [x22]\n"
+ "ld1 { v2.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x22]\n"
+ "ld1 { v8.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v17.16b, v15.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v11.16b, v15.16b\n"
- "mov v10.16b, v13.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v22.16b, v13.16b\n"
- "ldr d6, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v0.8h, v0.8b, v19.8b\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "usubl v2.8h, v2.8b, v19.8b\n"
- "usubl v3.8h, v3.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "usubl v4.8h, v4.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "usubl v6.8h, v6.8b, v19.8b\n"
- "usubl v7.8h, v7.8b, v19.8b\n"
- "usubl v8.8h, v8.8b, v19.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -687,700 +687,700 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v18.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x23], #0x4\n"
+ "ld1 { v19.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v18.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v15.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v18.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v15.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v18.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x23], #0x2\n"
+ "ld1 { v19.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v18.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v15.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v18.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x23]\n"
+ "ld1 { v19.b }[0], [x22]\n"
+ "ld1 { v15.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v13.4s, v31.8h, v8.8h\n"
- "ldr x23, [x15, #0x40]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v13.4s, v30.8h, v0.8h\n"
- "add x23, x23, x17\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v17.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v13.4s, v29.8h, v1.8h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v13.4s, v25.8h, v4.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x23], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x23], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x23]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x23]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x23], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x23]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x23]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v17.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v31.4h, v23.4h\n"
+ "smlal2 v4.4s, v31.8h, v23.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v28.4h, v7.4h\n"
+ "smlal2 v4.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v13.4s, v27.8h, v5.8h\n"
- "smlal v17.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v7.4h\n"
+ "smlal2 v2.4s, v27.8h, v7.8h\n"
+ "smlal v21.4s, v27.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v24.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v0.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v0.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v0.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v0.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v0.4h, v24.4h\n"
+ "smlal2 v1.4s, v0.8h, v24.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v13.4s, v25.8h, v6.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "smlal v8.4s, v15.4h, v3.4h\n"
+ "smlal2 v2.4s, v15.8h, v3.8h\n"
+ "smlal v20.4s, v15.4h, v12.4h\n"
+ "smlal2 v1.4s, v15.8h, v12.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v0.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v0.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v0.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v0.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
"ldr x20, [x15, #0x70]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "smlal2 v10.4s, v29.8h, v4.8h\n"
+ "smlal v20.4s, v0.4h, v23.4h\n"
+ "smlal2 v1.4s, v0.8h, v23.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v13.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v8.4s, v6.4h, v9.4h\n"
+ "smlal2 v2.4s, v6.8h, v9.8h\n"
+ "smlal v20.4s, v6.4h, v11.4h\n"
+ "smlal2 v1.4s, v6.8h, v11.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v9.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v16.4s, v27.4h, v23.4h\n"
+ "smlal2 v14.4s, v27.8h, v23.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v17.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v10.4h, v9.4h\n"
+ "smlal2 v4.4s, v10.8h, v9.8h\n"
+ "smlal v16.4s, v10.4h, v11.4h\n"
+ "smlal2 v14.4s, v10.8h, v11.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x15, #0x90]\n"
- "smlal v9.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v16.4s, v28.4h, v7.4h\n"
+ "smlal2 v14.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "add x24, x24, x17\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v15.4h, v3.4h\n"
+ "smlal2 v1.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v6.4h, v30.4h\n"
+ "smlal2 v4.4s, v6.8h, v30.8h\n"
+ "smlal v16.4s, v6.4h, v25.4h\n"
+ "smlal2 v14.4s, v6.8h, v25.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal2 v10.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal v9.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v12.4h, v7.4h\n"
+ "smlal2 v1.4s, v12.8h, v7.8h\n"
+ "smlal v16.4s, v12.4h, v24.4h\n"
+ "smlal2 v14.4s, v12.8h, v24.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v9.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v16.4s, v10.4h, v9.4h\n"
+ "smlal2 v14.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v15.4h, v30.4h\n"
+ "smlal2 v1.4s, v15.8h, v30.8h\n"
+ "smlal v16.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v30.4h\n"
+ "smlal2 v14.4s, v28.8h, v30.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v18.4s }, [x13], #0x10\n"
- "ld1 { v21.4s }, [x12], #0x10\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v24.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v24.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v24.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v21.d }[0], [x12], #0x8\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v21.s }[2], [x12]\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v21.s }[0], [x12]\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "and v2.16b, v15.16b, v21.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "and v17.16b, v8.16b, v23.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v13.4s, v13.4s, v30.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v16.16b, v13.16b, v31.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v18.16b, v17.16b, v21.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v28.16b, v11.16b, v21.16b\n"
- "sqrdmulh v10.4s, v10.4s, v30.4s\n"
- "and v2.16b, v9.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v20.16b, v31.16b\n"
+ "and v11.16b, v2.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v23.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "and v19.16b, v16.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "and v3.16b, v10.16b, v31.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v16.16b, v22.16b, v31.16b\n"
- "sqadd v17.4s, v17.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v2.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "srshl v17.4s, v17.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v9.4s, v9.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v15.8h, v13.4s\n"
- "sqxtn2 v17.8h, v20.4s\n"
- "sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v9.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v17.8h, v17.8h, v12.8h\n"
- "sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v9.8h, v9.8h, v12.8h\n"
- "smax v15.8h, v15.8h, v14.8h\n"
- "smax v17.8h, v17.8h, v14.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v9.8h, v9.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v23.8h\n"
- "smin v17.8h, v17.8h, v23.8h\n"
- "smin v11.8h, v11.8h, v23.8h\n"
- "smin v9.8h, v9.8h, v23.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "and v18.16b, v4.16b, v24.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v25.16b, v14.16b, v24.16b\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "srshl v2.4s, v2.4s, v24.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v24.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v17.s }[0], [x10], #0x4\n"
- "st1 { v11.s }[0], [x9], #0x4\n"
- "st1 { v9.s }[0], [x28], #0x4\n"
+ "st1 { v8.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v17.h }[2], [x10], #0x2\n"
- "st1 { v11.h }[2], [x9], #0x2\n"
- "st1 { v9.h }[2], [x28], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v17.b }[6], [x10], #0x1\n"
- "st1 { v11.b }[6], [x9], #0x1\n"
- "st1 { v9.b }[6], [x28], #0x1\n"
+ "st1 { v8.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v17.b }[4], [x10], #0x1\n"
- "st1 { v11.b }[4], [x9], #0x1\n"
- "st1 { v9.b }[4], [x28], #0x1\n"
+ "st1 { v8.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v17.h }[0], [x10], #0x2\n"
- "st1 { v11.h }[0], [x9], #0x2\n"
- "st1 { v9.h }[0], [x28], #0x2\n"
+ "st1 { v8.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v17.b }[2], [x10], #0x1\n"
- "st1 { v11.b }[2], [x9], #0x1\n"
- "st1 { v9.b }[2], [x28], #0x1\n"
+ "st1 { v8.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v17.b }[0], [x10], #0x1\n"
- "st1 { v11.b }[0], [x9], #0x1\n"
- "st1 { v9.b }[0], [x28], #0x1\n"
+ "st1 { v8.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
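(Aside, not part of the patch: the requantisation tail that recurs throughout the hunks above, sqrdmulh, the and/sshr #31/sqadd sign fixup, srshl by a negative shift, sqxtn, then the per-tensor offset add and smax/smin clamp, computes a fixed-point multiply-and-shift per 32-bit lane. A minimal scalar C++ sketch of that sequence follows; the function names and parameters are illustrative only, not the library's API.)

    #include <algorithm>
    #include <climits>
    #include <cstdint>

    // SQRDMULH: saturating rounding doubling high half of a 32x32 multiply.
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // only overflowing case
        int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)((p + (1LL << 30)) >> 31);
    }

    // AND/SSHR #31/SQADD followed by SRSHL with a negative shift operand:
    // a rounding right shift whose fixup makes ties round away from zero,
    // matching the rounding of the high multiply above.
    static int32_t rounding_right_shift(int32_t v, int exponent)
    {
        if (exponent <= 0) return v;                 // zero-shift lanes need no fixup
        if (v < 0 && v != INT32_MIN) v -= 1;         // (v & shift_mask) >> 31, sqadd saturates
        return (int32_t)(((int64_t)v + (1LL << (exponent - 1))) >> exponent);
    }

    // Full per-lane tail: multiply/shift, add the requantisation c_offset,
    // clamp to the quantised range, then narrow (sqxtn/uzp1 in the vector code).
    static uint8_t requantize_lane(int32_t acc, int32_t mul, int exponent,
                                   int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rounding_right_shift(sat_rounding_doubling_high_mul(acc, mul), exponent);
        v = std::min(std::max(v + c_offset, minval), maxval);
        return (uint8_t)v;
    }
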
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 7075f58f92..f2ab5831d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,8 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -35,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const uint8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
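
(Aside, not part of the patch: the "Oddments" ladders in these kernels, the tbz x7, #2 / #1 / #0 branches around partial ld1/st1 element forms, handle the n_channels % 8 leftover lanes by testing one bit of the remainder at a time, so no lane beyond the buffer is ever touched. A scalar sketch of the same decomposition, with illustrative names:)

    #include <cstdint>
    #include <cstring>

    // Copy an n-byte channel tail (n in 0..7) in 4-, 2-, then 1-byte pieces,
    // one piece per set bit of n, mirroring the bit-2/bit-1/bit-0 branches.
    static void copy_channel_tail(uint8_t *dst, const uint8_t *src, unsigned n)
    {
        n &= 7;                                                       // tst x7, #0x7
        if (n & 4) { std::memcpy(dst, src, 4); dst += 4; src += 4; }  // tbz ..., #2
        if (n & 2) { std::memcpy(dst, src, 2); dst += 2; src += 2; }  // tbz ..., #1
        if (n & 1) { *dst = *src; }                                   // tbz ..., #0
    }

(In the vector code the same decomposition selects which ld1/st1 element forms run, so each partial load lands in a fixed lane of the working register.)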
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index eec3ba5900..c8fe567e77 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -111,2071 +111,2071 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x9, x4, #0x3\n"
- "add x24, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v9.16b }, [x24]\n"
- "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x28, x22, %[offsetof_Requantize32_c_offset]\n"
- "add x24, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v15.8h }, [x28]\n"
- "ld1r { v14.8h }, [x24]\n"
- "add x20, x22, %[offsetof_Requantize32_maxval]\n"
- "mov x3, #0x0\n"
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x3, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v2.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v25.8h }, [x21]\n"
"ld1r { v12.8h }, [x20]\n"
- "mov x1, #0x0\n"
- "add x2, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x0, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x7, x8, [x25, #0x0]\n"
- "ldp x17, x16, [x25, #0x10]\n"
- "cbz x9, 3f\n"
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "subs x9, x9, #0x1\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr q11, [x13, #0x0]\n"
- "ldr q13, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "mov v8.16b, v11.16b\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x4, #0x0\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "mov x5, #0x0\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "subs x3, x3, #0x1\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "ldp x27, x26, [x6, #0x10]\n"
"mov v7.16b, v13.16b\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "ldr d31, [x10, x3]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldr d30, [x28, x3]\n"
- "ldr d29, [x27, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
+ "mov v14.16b, v24.16b\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "ldr d10, [x9, x4]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldr d16, [x28, x4]\n"
+ "ldr d23, [x27, x4]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr d30, [x26, x4]\n"
+ "ldr d4, [x25, x4]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr d28, [x26, x3]\n"
- "ldr d27, [x25, x3]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d28, [x24, x4]\n"
+ "ldr d31, [x23, x4]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr d23, [x24, x3]\n"
- "ldr d25, [x23, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr d24, [x22, x3]\n"
- "ldr d26, [x21, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr d22, [x20, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr d1, [x22, x4]\n"
+ "ldr d9, [x21, x4]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x6, #0x0]\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr q16, [x6, #0x10]\n"
- "ldr q10, [x5, #0x10]\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x3]\n"
+ "ldr d5, [x7, #0x28]\n"
+ "ldr d6, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d19, [x7, #0x38]\n"
+ "ldr d0, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d20, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x21, [x2, #0x60]\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x3]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ldr x25, [x2, #0x70]\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "ldr d0, [x0, #0x28]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x23, [x2, #0x80]\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x3]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "ldr x15, [x2, #0x90]\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d1, [x0, #0x30]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x14, [x2, #0xa0]\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x3]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "ldr x12, [x2, #0xb0]\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d2, [x0, #0x38]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x11, [x2, #0xc0]\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x25, x3]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "ldr x22, [x2, #0xd0]\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "ldr d3, [x0, #0x40]\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x3]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x3]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x25, [x2, #0xf0]\n"
- "subs x9, x9, #0x1\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x0, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "add x6, x6, #0x20\n"
- "add x5, x5, #0x20\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x3]\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x22, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal v13.4s, v28.4h, v19.4h\n"
+ "ldr x21, [x6, #0xb0]\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "ldr d5, [x23, x4]\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "smlal v8.4s, v31.4h, v6.4h\n"
+ "ldr x12, [x6, #0xc0]\n"
+ "ldr x11, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v19.8h\n"
+ "smlal v13.4s, v16.4h, v0.4h\n"
+ "ldr x10, [x6, #0xd0]\n"
+ "ldr x9, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d6, [x22, x4]\n"
+ "smlal v7.4s, v16.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v19.4h\n"
+ "smlal v8.4s, v29.4h, v19.4h\n"
+ "ldr x28, [x6, #0xe0]\n"
+ "ldr x27, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x26, [x6, #0xf0]\n"
+ "ldr x25, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v19.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v19.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v19.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal v7.4s, v21.4h, v0.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v0.4h\n"
+ "smlal v8.4s, v1.4h, v0.4h\n"
+ "ldr x24, [x6, #0x100]\n"
+ "ldr x23, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v20.4h\n"
+ "ldr x22, [x6, #0x110]\n"
+ "ldr x21, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v0.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v0.8h\n"
+ "ldr d0, [x20, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v24.4s, v11.8h, v20.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x12, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x11, x4]\n"
+ "smlal v7.4s, v15.4h, v20.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v20.4h\n"
+ "smlal v8.4s, v3.4h, v20.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v20.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v20.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v20.8h\n"
+ "ldr d20, [x10, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v5.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v23.8h\n"
+ "ldr d23, [x9, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x0, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x23, [x2, #0x100]\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x0, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x3]\n"
+ "smlal v27.4s, v5.4h, v30.4h\n"
+ "smlal v8.4s, v6.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v5.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v30.8h\n"
+ "ldr d30, [x28, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x0, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x3]\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x3]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x0, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x0, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x0, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x0, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "ldr d23, [x10, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x0, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x3]\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x3]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x0, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal v27.4s, v6.4h, v28.4h\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v6.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x3]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x0, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x0, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x0, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x0, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x3]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "add x0, x0, #0xc8\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "add x3, x3, #0x8\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
- "sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
+ "smlal v27.4s, v19.4h, v16.4h\n"
+ "smlal v8.4s, v0.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x26, x4]\n"
+ "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x25, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v5.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x24, x4]\n"
+ "smlal v7.4s, v5.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v20.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v13.4s, v6.4h, v31.4h\n"
+ "smlal2 v14.4s, v5.8h, v11.8h\n"
+ "ldr d5, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v11.8h\n"
+ "ldr d11, [x23, x4]\n"
+ "smlal v7.4s, v6.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v6.8h, v31.8h\n"
+ "smlal v13.4s, v19.4h, v29.4h\n"
+ "smlal2 v14.4s, v6.8h, v15.8h\n"
+ "ldr d6, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v20.8h, v15.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x22, x4]\n"
+ "smlal v7.4s, v19.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "add x7, x7, #0xc8\n"
+ "smlal2 v24.4s, v19.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v0.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v0.8h, v29.8h\n"
+ "ldr q0, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "smlal2 v14.4s, v20.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v23.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "sqadd v13.4s, v13.4s, v23.4s\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "and v10.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v19.4h, v6.4h\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "smlal2 v17.4s, v19.8h, v6.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v28.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v0.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v0.4s\n"
+ "and v23.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v22.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v28.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
- "srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d11, [x7, x1]\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d20, [x8, x1]\n"
- "str d8, [x17, x1]\n"
- "str d6, [x16, x1]\n"
- "ldr q11, [x13, #0x0]\n"
- "ldr q13, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "add x1, x1, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "mov v8.16b, v11.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "add x5, x5, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
"mov v7.16b, v13.16b\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr d31, [x10, x3]\n"
- "ldr d30, [x28, x3]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr d29, [x27, x3]\n"
- "ldr d28, [x26, x3]\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr d27, [x25, x3]\n"
- "ldr d23, [x24, x3]\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldr d10, [x9, x4]\n"
+ "ldr d16, [x28, x4]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr d23, [x27, x4]\n"
+ "ldr d30, [x26, x4]\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d4, [x25, x4]\n"
+ "ldr d28, [x24, x4]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d31, [x23, x4]\n"
+ "ldr d1, [x22, x4]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr d25, [x23, x3]\n"
- "ldr d24, [x22, x3]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ldr d9, [x21, x4]\n"
+ "ldr d11, [x20, x4]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr d26, [x21, x3]\n"
- "ldr d22, [x20, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ushll v22.8h, v22.8b, #0x0\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x6, #0x0]\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr q16, [x6, #0x10]\n"
- "ldr q10, [x5, #0x10]\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x3]\n"
+ "ldr d0, [x7, #0x28]\n"
+ "ldr d20, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d6, [x7, #0x38]\n"
+ "ldr d19, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d5, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x21, [x2, #0x60]\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x3]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ldr x25, [x2, #0x70]\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "ldr d0, [x0, #0x28]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x23, [x2, #0x80]\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x3]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "ldr x15, [x2, #0x90]\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d1, [x0, #0x30]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x14, [x2, #0xa0]\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x3]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "ldr x12, [x2, #0xb0]\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d2, [x0, #0x38]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x11, [x2, #0xc0]\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x25, x3]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "ldr x22, [x2, #0xd0]\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "ldr d3, [x0, #0x40]\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x3]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x3]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x25, [x2, #0xf0]\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x0, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "tst x4, #0x7\n"
- "add x6, x6, #0x20\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x3]\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x22, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x21, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal v13.4s, v28.4h, v6.4h\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "ldr x12, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "smlal v8.4s, v31.4h, v20.4h\n"
+ "ldr x11, [x6, #0xc0]\n"
+ "ldr x10, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v6.8h\n"
+ "smlal v13.4s, v16.4h, v19.4h\n"
+ "ldr x9, [x6, #0xd0]\n"
+ "ldr x28, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v20.8h\n"
+ "ldr d20, [x21, x4]\n"
+ "smlal v7.4s, v16.4h, v6.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v6.4h\n"
+ "smlal v8.4s, v29.4h, v6.4h\n"
+ "ldr x27, [x6, #0xe0]\n"
+ "ldr x26, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v19.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x25, [x6, #0xf0]\n"
+ "ldr x24, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v6.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal v7.4s, v21.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v19.4h\n"
+ "smlal v8.4s, v1.4h, v19.4h\n"
+ "ldr x23, [x6, #0x100]\n"
+ "ldr x22, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v5.4h\n"
+ "ldr x21, [x6, #0x110]\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v19.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v19.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v19.8h\n"
+ "ldr d19, [x12, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "tst x2, #0x7\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x11, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x10, x4]\n"
+ "smlal v7.4s, v15.4h, v5.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v5.4h\n"
+ "smlal v8.4s, v3.4h, v5.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v5.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v5.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v5.8h\n"
+ "ldr d5, [x9, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v0.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v0.8h, v23.8h\n"
+ "ldr d23, [x28, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr x23, [x2, #0x100]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x0, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x3]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x0, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x3]\n"
+ "smlal v27.4s, v0.4h, v30.4h\n"
+ "smlal v8.4s, v20.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v0.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v30.8h\n"
+ "ldr d30, [x27, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x0, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x3]\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x3]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x0, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x0, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x0, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x0, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "ldr d23, [x10, x3]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x0, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "ldr d31, [x22, x3]\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x3]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x0, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal v8.4s, v6.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v28.8h\n"
+ "ldr d28, [x26, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x3]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x0, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x3]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x0, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x0, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x0, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x3]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x0, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x3]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x3]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x3]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "add x3, x3, #0x8\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
- "sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
+ "smlal v27.4s, v6.4h, v16.4h\n"
+ "smlal v8.4s, v19.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x25, x4]\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v19.8h, v16.8h\n"
+ "ldr d16, [x24, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v0.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x23, x4]\n"
+ "smlal v7.4s, v0.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v0.8h, v15.8h\n"
+ "smlal v13.4s, v20.4h, v31.4h\n"
+ "smlal2 v14.4s, v0.8h, v11.8h\n"
+ "ldr d0, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v11.8h\n"
+ "ldr d11, [x22, x4]\n"
+ "smlal v7.4s, v20.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v20.8h, v31.8h\n"
+ "smlal v13.4s, v6.4h, v29.4h\n"
+ "smlal2 v14.4s, v20.8h, v15.8h\n"
+ "ldr d20, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x21, x4]\n"
+ "smlal v7.4s, v6.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v19.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v29.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v5.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v5.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v5.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v21.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "and v16.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v6.4h, v20.4h\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "smlal2 v17.4s, v6.8h, v20.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "and v3.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v19.16b, v22.16b, v29.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v30.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
- "srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v3.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d11, [x7, x1]\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d20, [x8, x1]\n"
- "str d8, [x17, x1]\n"
- "str d6, [x16, x1]\n"
- "add x1, x1, #0x8\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "add x5, x5, #0x8\n"
"beq 124f\n"
- "add x0, x0, #0xc8\n"
+ "add x7, x7, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
- "ld1 { v11.4s }, [x13], #0x10\n"
- "tbz x4, #1, 4f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v13.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
- "ld1 { v11.d }[0], [x13], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v11.s }[2], [x13]\n"
+ "tbz x2, #1, 6f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v11.s }[0], [x13]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x0, #0x0]\n"
- "ldr d1, [x0, #0x8]\n"
- "mov v20.16b, v11.16b\n"
- "mov v19.16b, v13.16b\n"
- "ldr d2, [x0, #0x10]\n"
- "ldr d3, [x0, #0x18]\n"
- "mov v8.16b, v11.16b\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
"mov v7.16b, v13.16b\n"
- "ldr d4, [x0, #0x20]\n"
- "ldp x10, x28, [x2, #0x0]\n"
- "mov v6.16b, v11.16b\n"
- "mov v5.16b, v13.16b\n"
- "ldp x27, x26, [x2, #0x10]\n"
- "ldp x25, x24, [x2, #0x20]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldp x23, x22, [x2, #0x30]\n"
- "ldp x21, x20, [x2, #0x40]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "add x10, x10, x3\n"
- "add x28, x28, x3\n"
- "add x27, x27, x3\n"
- "add x26, x26, x3\n"
- "add x25, x25, x3\n"
- "add x24, x24, x3\n"
- "add x23, x23, x3\n"
- "add x22, x22, x3\n"
- "add x21, x21, x3\n"
- "add x20, x20, x3\n"
- "tbz x4, #2, 9f\n"
- "ld1 { v31.s }[0], [x10], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 8f\n"
- "ld1 { v31.h }[2], [x10], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[6], [x10]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "add x9, x9, x4\n"
+ "add x28, x28, x4\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v16.s }[0], [x28], #0x4\n"
+ "ld1 { v23.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v4.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v1.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v4.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v1.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[6], [x9]\n"
+ "ld1 { v16.b }[6], [x28]\n"
+ "ld1 { v23.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v4.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v1.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[4], [x10]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[4], [x9]\n"
+ "ld1 { v16.b }[4], [x28]\n"
+ "ld1 { v23.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v4.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v1.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
- "ld1 { v31.h }[0], [x10], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[2], [x10]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x2, #1, 10f\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v16.h }[0], [x28], #0x2\n"
+ "ld1 { v23.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v4.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v1.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[2], [x9]\n"
+ "ld1 { v16.b }[2], [x28]\n"
+ "ld1 { v23.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v4.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
- "ld1 { v31.b }[0], [x10]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
- "ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[0], [x9]\n"
+ "ld1 { v16.b }[0], [x28]\n"
+ "ld1 { v23.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v4.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v1.b }[0], [x22]\n"
+ "ld1 { v9.b }[0], [x21]\n"
+ "ld1 { v11.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "ldr x20, [x6, #0x50]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "ldr x20, [x2, #0x50]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
+ "add x20, x20, x4\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "add x20, x20, x3\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v19.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v13.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x2, #1, 14f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x22, [x2, #0x58]\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "add x22, x22, x3\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v8.4s, v5.4h, v29.4h\n"
+ "smlal2 v17.4s, v5.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "smlal v27.4s, v5.4h, v18.4h\n"
+ "smlal2 v22.4s, v5.8h, v18.8h\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x2, #1, 18f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x21, [x2, #0x60]\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "add x21, x21, x3\n"
- "tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v8.4s, v10.4h, v18.4h\n"
+ "smlal2 v17.4s, v10.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "tbz x2, #1, 22f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x0, #0x28]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x20, [x2, #0x68]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "smlal2 v13.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v19.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr d6, [x7, #0x28]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v7.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "smlal v27.4s, v10.4h, v3.4h\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v6.4h\n"
+ "smlal2 v24.4s, v23.8h, v6.8h\n"
+ "smlal v7.4s, v30.4h, v6.4h\n"
+ "smlal2 v14.4s, v30.8h, v6.8h\n"
+ "smlal v27.4s, v11.4h, v6.4h\n"
+ "smlal2 v22.4s, v11.8h, v6.8h\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x2, #1, 26f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x0, #0x30]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x25, [x2, #0x70]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "add x25, x25, x3\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "smlal2 v13.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x25]\n"
+ "ldr d4, [x7, #0x30]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v17.4s, v20.8h, v6.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v14.4s, v28.8h, v4.8h\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal2 v22.4s, v20.8h, v4.8h\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x25]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "tbz x2, #1, 30f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x0, #0x38]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x26, [x2, #0x78]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "add x26, x26, x3\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "smlal2 v13.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[6], [x26]\n"
+ "ldr d30, [x7, #0x38]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v28.4h, v30.4h\n"
+ "smlal2 v24.4s, v28.8h, v30.8h\n"
+ "smlal v7.4s, v5.4h, v30.4h\n"
+ "smlal2 v14.4s, v5.8h, v30.8h\n"
+ "smlal v27.4s, v23.4h, v30.4h\n"
+ "smlal2 v22.4s, v23.8h, v30.8h\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[4], [x26]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[2], [x26]\n"
+ "tbz x2, #1, 34f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[0], [x26]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x0, #0x40]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x23, [x2, #0x80]\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "add x23, x23, x3\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v13.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ldr d16, [x7, #0x40]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v8.4s, v3.4h, v30.4h\n"
+ "smlal2 v17.4s, v3.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v7.4s, v10.4h, v16.4h\n"
+ "smlal2 v14.4s, v10.8h, v16.8h\n"
+ "smlal v27.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "tbz x2, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x0, #0x48]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x24, [x2, #0x88]\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "add x24, x24, x3\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "smlal2 v13.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ldr d1, [x7, #0x48]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v8.4s, v6.4h, v16.4h\n"
+ "smlal2 v17.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v10.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v9.8h, v1.8h\n"
+ "smlal v27.4s, v6.4h, v1.4h\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "tbz x2, #2, 41f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 40f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x2, #1, 42f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x0, #0x50]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x15, [x2, #0x90]\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "add x15, x15, x3\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v13.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v19.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ldr d28, [x7, #0x50]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v8.4s, v18.4h, v1.4h\n"
+ "smlal2 v17.4s, v18.8h, v1.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v11.4h, v28.4h\n"
+ "smlal2 v24.4s, v11.8h, v28.8h\n"
+ "smlal v7.4s, v20.4h, v28.4h\n"
+ "smlal2 v14.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "tbz x2, #1, 46f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x21, [x2, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x3\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v27.4s, v30.4h, v28.4h\n"
+ "smlal2 v22.4s, v30.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 49f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 48f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x2, #1, 50f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x0, #0x58]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x14, [x2, #0xa0]\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "add x14, x14, x3\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v13.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ldr d0, [x7, #0x58]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v0.4h\n"
+ "smlal2 v24.4s, v20.8h, v0.8h\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v23.8h, v0.8h\n"
+ "smlal v27.4s, v19.4h, v0.4h\n"
+ "smlal2 v22.4s, v19.8h, v0.8h\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "tbz x2, #1, 54f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x0, #0x60]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x13, [x2, #0xa8]\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "add x13, x13, x3\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v13.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "tbz x4, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ldr d10, [x7, #0x60]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v8.4s, v9.4h, v0.4h\n"
+ "smlal2 v17.4s, v9.8h, v0.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v10.4h\n"
+ "smlal2 v24.4s, v23.8h, v10.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v14.4s, v3.8h, v10.8h\n"
+ "smlal v27.4s, v9.4h, v10.4h\n"
+ "smlal2 v22.4s, v9.8h, v10.8h\n"
+ "tbz x2, #2, 57f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 56f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "tbz x2, #1, 58f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x0, #0x68]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x12, [x2, #0xb0]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x12, x12, x3\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v13.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ldr d28, [x7, #0x68]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal2 v17.4s, v20.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v3.4h, v28.4h\n"
+ "smlal2 v24.4s, v3.8h, v28.8h\n"
+ "smlal v7.4s, v6.4h, v28.4h\n"
+ "smlal2 v14.4s, v6.8h, v28.8h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "tbz x2, #1, 62f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x0, #0x70]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x20, [x2, #0xb8]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v13.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ldr d23, [x7, #0x70]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v8.4s, v5.4h, v28.4h\n"
+ "smlal2 v17.4s, v5.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v6.4h, v23.4h\n"
+ "smlal2 v24.4s, v6.8h, v23.8h\n"
+ "smlal v7.4s, v18.4h, v23.4h\n"
+ "smlal2 v14.4s, v18.8h, v23.8h\n"
+ "smlal v27.4s, v5.4h, v23.4h\n"
+ "smlal2 v22.4s, v5.8h, v23.8h\n"
+ "tbz x2, #2, 65f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 64f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x2, #1, 66f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x0, #0x78]\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x11, [x2, #0xc0]\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "add x11, x11, x3\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v13.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ldr d4, [x7, #0x78]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v8.4s, v29.4h, v23.4h\n"
+ "smlal2 v17.4s, v29.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v19.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v4.8h\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "tbz x2, #1, 70f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr x10, [x2, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x10, x10, x3\n"
- "tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x10], #0x4\n"
- "tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x10]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x6, #0xc8]\n"
+ "smlal v27.4s, v18.4h, v4.4h\n"
+ "smlal2 v22.4s, v18.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 73f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 72f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x10]\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x10], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x10]\n"
+ "tbz x2, #1, 74f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x10]\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x0, #0x80]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x22, [x2, #0xd0]\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "add x22, x22, x3\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v13.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "ldr d23, [x7, #0x80]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd0]\n"
+ "smlal v8.4s, v1.4h, v4.4h\n"
+ "smlal2 v17.4s, v1.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v19.4h, v23.4h\n"
+ "smlal2 v24.4s, v19.8h, v23.8h\n"
+ "smlal v7.4s, v9.4h, v23.4h\n"
+ "smlal2 v14.4s, v9.8h, v23.8h\n"
+ "smlal v27.4s, v1.4h, v23.4h\n"
+ "smlal2 v22.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x2, #1, 78f\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x0, #0x88]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x28, [x2, #0xd8]\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "add x28, x28, x3\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v13.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ldr d30, [x7, #0x88]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd8]\n"
+ "smlal v8.4s, v4.4h, v23.4h\n"
+ "smlal2 v17.4s, v4.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v9.4h, v30.4h\n"
+ "smlal2 v24.4s, v9.8h, v30.8h\n"
+ "smlal v7.4s, v20.4h, v30.4h\n"
+ "smlal2 v14.4s, v20.8h, v30.8h\n"
+ "smlal v27.4s, v4.4h, v30.4h\n"
+ "smlal2 v22.4s, v4.8h, v30.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x2, #1, 82f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x0, #0x90]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x27, [x2, #0xe0]\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "add x27, x27, x3\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v13.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ldr d3, [x7, #0x90]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe0]\n"
+ "smlal v8.4s, v21.4h, v30.4h\n"
+ "smlal2 v17.4s, v21.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v7.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v5.8h, v3.8h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "tbz x2, #2, 85f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "tbz x2, #1, 86f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x0, #0x98]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x26, [x2, #0xe8]\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "add x26, x26, x3\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v13.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ldr d19, [x7, #0x98]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe8]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v19.4h\n"
+ "smlal2 v24.4s, v5.8h, v19.8h\n"
+ "smlal v7.4s, v29.4h, v19.4h\n"
+ "smlal2 v14.4s, v29.8h, v19.8h\n"
+ "smlal v27.4s, v30.4h, v19.4h\n"
+ "smlal2 v22.4s, v30.8h, v19.8h\n"
+ "tbz x2, #2, 89f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 88f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "tbz x2, #1, 90f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x0, #0xa0]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "ldr x25, [x2, #0xf0]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x25, x25, x3\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v13.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v0.8h\n"
- "tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ldr d23, [x7, #0xa0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "smlal v8.4s, v20.4h, v19.4h\n"
+ "smlal2 v17.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v18.4h, v23.4h\n"
+ "smlal2 v24.4s, v18.8h, v23.8h\n"
+ "smlal v7.4s, v1.4h, v23.4h\n"
+ "smlal2 v14.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 93f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 92f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "tbz x2, #1, 94f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ushll v25.8h, v25.8b, #0x0\n"
- "ldr x24, [x2, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x3\n"
- "tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0xf8]\n"
+ "smlal v27.4s, v10.4h, v23.4h\n"
+ "smlal2 v22.4s, v10.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 97f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 96f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "tbz x2, #1, 98f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x0, #0xa8]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr x23, [x2, #0x100]\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "add x23, x23, x3\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v13.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ldr d5, [x7, #0xa8]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x20, [x6, #0x100]\n"
+ "smlal v8.4s, v18.4h, v23.4h\n"
+ "smlal2 v17.4s, v18.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v1.4h, v5.4h\n"
+ "smlal2 v24.4s, v1.8h, v5.8h\n"
+ "smlal v7.4s, v4.4h, v5.4h\n"
+ "smlal2 v14.4s, v4.8h, v5.8h\n"
+ "smlal v27.4s, v18.4h, v5.4h\n"
+ "smlal2 v22.4s, v18.8h, v5.8h\n"
+ "tbz x2, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x2, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x0, #0xb0]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr x15, [x2, #0x108]\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "add x15, x15, x3\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v13.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ldr d18, [x7, #0xb0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr x20, [x6, #0x108]\n"
+ "smlal v8.4s, v9.4h, v5.4h\n"
+ "smlal2 v17.4s, v9.8h, v5.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v21.4h, v18.4h\n"
+ "smlal2 v14.4s, v21.8h, v18.8h\n"
+ "smlal v27.4s, v9.4h, v18.4h\n"
+ "smlal2 v22.4s, v9.8h, v18.8h\n"
+ "tbz x2, #2, 105f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 104f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "tbz x2, #1, 106f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x0, #0xb8]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr x21, [x2, #0x110]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x21, x21, x3\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v13.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ldr d11, [x7, #0xb8]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "ldr x20, [x6, #0x110]\n"
+ "smlal v8.4s, v5.4h, v18.4h\n"
+ "smlal2 v17.4s, v5.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v21.4h, v11.4h\n"
+ "smlal2 v24.4s, v21.8h, v11.8h\n"
+ "smlal v7.4s, v30.4h, v11.4h\n"
+ "smlal2 v14.4s, v30.8h, v11.8h\n"
+ "smlal v27.4s, v5.4h, v11.4h\n"
+ "smlal2 v22.4s, v5.8h, v11.8h\n"
+ "tbz x2, #2, 109f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 108f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "tbz x2, #1, 110f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x0, #0xc0]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr x20, [x2, #0x118]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x20, x20, x3\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v13.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d16, [x7, #0xc0]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal v8.4s, v18.4h, v11.4h\n"
+ "smlal2 v17.4s, v18.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v16.4h\n"
+ "smlal2 v24.4s, v30.8h, v16.8h\n"
+ "smlal v7.4s, v20.4h, v16.4h\n"
+ "smlal2 v14.4s, v20.8h, v16.8h\n"
+ "smlal v27.4s, v18.4h, v16.4h\n"
+ "smlal2 v22.4s, v18.8h, v16.8h\n"
+ "tbz x2, #2, 113f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 112f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x4, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x2, #1, 114f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v18.4s }, [x6], #0x10\n"
- "ld1 { v21.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v16.d }[0], [x6], #0x8\n"
- "ld1 { v10.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v16.s }[2], [x6]\n"
- "ld1 { v10.s }[2], [x5]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v8.4s, v21.4h, v16.4h\n"
+ "smlal2 v17.4s, v21.8h, v16.8h\n"
+ "tbz x2, #2, 117f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "tbz x2, #1, 116f\n"
+ "ld1 { v18.d }[0], [x8], #0x8\n"
+ "ld1 { v0.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[2], [x8]\n"
+ "ld1 { v0.s }[2], [x17]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v16.s }[0], [x6]\n"
- "ld1 { v10.s }[0], [x5]\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[0], [x8]\n"
+ "ld1 { v0.s }[0], [x17]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v18.d }[0], [x6], #0x8\n"
- "ld1 { v21.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v18.s }[2], [x6]\n"
- "ld1 { v21.s }[2], [x5]\n"
+ "tbz x2, #1, 118f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v21.s }[2], [x17]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v18.s }[0], [x6]\n"
- "ld1 { v21.s }[0], [x5]\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v21.s }[0], [x17]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "and v31.16b, v11.16b, v21.16b\n"
- "add x7, x7, x1\n"
- "add x8, x8, x1\n"
"sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add x17, x17, x1\n"
- "add x16, x16, x1\n"
- "and v17.16b, v13.16b, v10.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v11.4s, v11.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v20.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v16.4s\n"
- "and v18.16b, v8.16b, v21.16b\n"
+ "and v5.16b, v13.16b, v21.16b\n"
+ "add x16, x16, x5\n"
+ "add x15, x15, x5\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "add x14, x14, x5\n"
+ "add x13, x13, x5\n"
+ "and v2.16b, v24.16b, v0.16b\n"
"sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "and v31.16b, v6.16b, v21.16b\n"
- "sqrdmulh v5.4s, v5.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v17.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v27.16b, v19.16b, v10.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v25.16b, v7.16b, v10.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v21.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "and v20.16b, v27.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v18.4s\n"
+ "and v31.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v0.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v11.16b, v22.16b, v0.16b\n"
"sshr v31.4s, v31.4s, #0x1f\n"
- "and v17.16b, v5.16b, v10.16b\n"
- "sqadd v20.4s, v20.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v31.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v27.4s\n"
+ "and v10.16b, v17.16b, v0.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v11.4s\n"
"srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "srshl v6.4s, v6.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqxtn v11.4h, v11.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqadd v17.4s, v17.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v0.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
"sqxtn v8.4h, v8.4s\n"
- "srshl v5.4s, v5.4s, v10.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v11.8h, v13.4s\n"
- "sqxtn2 v20.8h, v19.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v11.8h, v11.8h, v15.8h\n"
- "sqadd v20.8h, v20.8h, v15.8h\n"
- "sqadd v8.8h, v8.8h, v15.8h\n"
- "sqadd v6.8h, v6.8h, v15.8h\n"
- "smax v11.8h, v11.8h, v14.8h\n"
- "smax v20.8h, v20.8h, v14.8h\n"
- "smax v8.8h, v8.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v11.8h, v11.8h, v12.8h\n"
- "smin v20.8h, v20.8h, v12.8h\n"
- "smin v8.8h, v8.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v11.s }[0], [x7], #0x4\n"
- "st1 { v20.s }[0], [x8], #0x4\n"
- "st1 { v8.s }[0], [x17], #0x4\n"
- "st1 { v6.s }[0], [x16], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v11.h }[2], [x7], #0x2\n"
- "st1 { v20.h }[2], [x8], #0x2\n"
- "st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v6.h }[2], [x16], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[6], [x7], #0x1\n"
- "st1 { v20.b }[6], [x8], #0x1\n"
- "st1 { v8.b }[6], [x17], #0x1\n"
- "st1 { v6.b }[6], [x16], #0x1\n"
+ "tbz x2, #2, 121f\n"
+ "st1 { v13.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x15], #0x4\n"
+ "st1 { v27.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 120f\n"
+ "st1 { v13.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x15], #0x2\n"
+ "st1 { v27.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x15], #0x1\n"
+ "st1 { v27.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x13], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[4], [x7], #0x1\n"
- "st1 { v20.b }[4], [x8], #0x1\n"
- "st1 { v8.b }[4], [x17], #0x1\n"
- "st1 { v6.b }[4], [x16], #0x1\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x15], #0x1\n"
+ "st1 { v27.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x13], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v11.h }[0], [x7], #0x2\n"
- "st1 { v20.h }[0], [x8], #0x2\n"
- "st1 { v8.h }[0], [x17], #0x2\n"
- "st1 { v6.h }[0], [x16], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[2], [x7], #0x1\n"
- "st1 { v20.b }[2], [x8], #0x1\n"
- "st1 { v8.b }[2], [x17], #0x1\n"
- "st1 { v6.b }[2], [x16], #0x1\n"
+ "tbz x2, #1, 122f\n"
+ "st1 { v13.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x15], #0x2\n"
+ "st1 { v27.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x15], #0x1\n"
+ "st1 { v27.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x13], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v11.b }[0], [x7], #0x1\n"
- "st1 { v20.b }[0], [x8], #0x1\n"
- "st1 { v8.b }[0], [x17], #0x1\n"
- "st1 { v6.b }[0], [x16], #0x1\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x15], #0x1\n"
+ "st1 { v27.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x13], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
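(Aside, not part of the patch: the register renames above leave the kernel's requantization idiom unchanged. The recurring sqrdmulh / and / sshr / sqadd / srshl / sqxtn sequence is the standard fixed-point output-rescaling pattern for these quantized depthwise kernels. A minimal sketch of the same instruction pattern with ACLE NEON intrinsics, assuming per-channel multipliers and non-positive shifts as loaded from requant_muls/requant_shifts; names here are hypothetical and illustrative only.)

  // Illustrative sketch only -- not code from this patch.
  #include <arm_neon.h>

  static inline int16x4_t requantize_block(int32x4_t acc,   // int32 accumulator lane block
                                           int32x4_t mul,   // per-channel multiplier
                                           int32x4_t shift) // per-channel shift, <= 0
  {
      // Saturating rounding doubling multiply-high: (2*acc*mul + 2^31) >> 32.
      int32x4_t t = vqrdmulhq_s32(acc, mul);
      // Sign-dependent correction applied before the rounding shift
      // (mirrors the and / sshr #31 / sqadd triple in the assembly).
      int32x4_t fixup = vshrq_n_s32(vandq_s32(t, shift), 31);
      t = vqaddq_s32(t, fixup);
      // srshl with a negative shift count is a rounding right shift.
      t = vrshlq_s32(t, shift);
      // The assembly then narrows (sqxtn/sqxtn2), adds Requantize32::c_offset,
      // clamps to [minval, maxval] (smax/smin) and narrows again (uzp1).
      return vqmovn_s32(t);
  }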
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index cf655cbe78..7b0b414517 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,16 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const
-);
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
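(Aside, not part of the patch: the "Oddments" blocks in the generic.cpp diffs on either side of this header handle the leftover n_channels % 8 elements. The tbz tests on bits #2, #1 and #0 of the remainder select partial loads/stores of 4, 2 and 1 byte(s) respectively. A scalar C++ sketch of the same dispatch, with hypothetical names, purely for orientation.)

  // Illustrative sketch only -- not code from this patch.
  #include <cstdint>
  #include <cstring>

  static void copy_oddments(const uint8_t *src, uint8_t *dst, unsigned int remainder)
  {
      // remainder = n_channels & 7; each set bit selects one partial copy,
      // matching the tbz #2 / #1 / #0 cascade in the assembly.
      if (remainder & 4) { std::memcpy(dst, src, 4); src += 4; dst += 4; }
      if (remainder & 2) { std::memcpy(dst, src, 2); src += 2; dst += 2; }
      if (remainder & 1) { *dst = *src; }
  }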
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4419048793..89253ba670 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -91,1072 +91,1072 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x7, x6, #0x3\n"
+ "lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v24.16b }, [x20]\n"
+ "ld1r { v14.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
- "mov x8, #0x0\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x12, x11, [x22, #0x0]\n"
- "ldp x10, x9, [x22, #0x10]\n"
- "cbz x7, 3f\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "subs x7, x7, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d31, [x24, x8]\n"
- "ldr d30, [x23, x8]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d29, [x22, x8]\n"
- "ldr d28, [x21, x8]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr x20, [x16, #0x20]\n"
- "ldr d27, [x20, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x15, x15, #0x48\n"
- "subs x7, x7, #0x1\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
"add x13, x13, #0x20\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "ldr q13, [x28, #0x0]\n"
- "ldr q20, [x28, #0x10]\n"
- "add x28, x28, #0x20\n"
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "add x17, x17, #0x8\n"
- "str x28, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr d31, [x24, x8]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x23, x8]\n"
- "ldr d29, [x22, x8]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ldr d28, [x21, x8]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d27, [x20, x8]\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q17, [x14, #0x0]\n"
- "ldr q22, [x13, #0x0]\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr q23, [x14, #0x10]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x10]\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "ldr d29, [x20, x8]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "ldr x27, [x16, #0x40]\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "ldr x24, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x28, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "ldr x20, [x16, #0x78]\n"
- "tst x6, #0x7\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "add x14, x14, #0x20\n"
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
"add x13, x13, #0x20\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr d31, [x27, x8]\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "ldr d30, [x26, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "ldr d28, [x24, x8]\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x25, x8]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "ldr d31, [x23, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x22, x8]\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "ldr d29, [x21, x8]\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "ldr d28, [x20, x8]\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "add x8, x8, #0x8\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x12, x17]\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d9, [x11, x17]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str d16, [x10, x17]\n"
- "str d25, [x9, x17]\n"
- "add x17, x17, #0x8\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x15, x15, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x6, #2, 5f\n"
- "ld1 { v13.4s }, [x28], #0x10\n"
- "tbz x6, #1, 4f\n"
- "ld1 { v20.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[2], [x28]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v20.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x6, #1, 6f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[2], [x28]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 7f\n"
- "ld1 { v13.s }[0], [x28]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x15, #0x0]\n"
- "ldr d1, [x15, #0x8]\n"
- "mov v9.16b, v13.16b\n"
- "mov v18.16b, v20.16b\n"
- "ldr d2, [x15, #0x10]\n"
- "ldr d3, [x15, #0x18]\n"
- "mov v16.16b, v13.16b\n"
- "mov v26.16b, v20.16b\n"
- "ldr d4, [x15, #0x20]\n"
- "ldr d5, [x15, #0x28]\n"
- "mov v25.16b, v13.16b\n"
- "mov v10.16b, v20.16b\n"
- "ldr d6, [x15, #0x30]\n"
- "ldr d7, [x15, #0x38]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x15, #0x40]\n"
- "ldp x24, x23, [x16, #0x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldp x22, x21, [x16, #0x10]\n"
- "ldr x20, [x16, #0x20]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v5.8h, v5.8b, v15.8b\n"
- "ssubl v6.8h, v6.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v8.8h, v8.8b, v15.8b\n"
- "add x24, x24, x8\n"
- "add x23, x23, x8\n"
- "add x22, x22, x8\n"
- "add x21, x21, x8\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 9f\n"
- "ld1 { v31.s }[0], [x24], #0x4\n"
- "ld1 { v30.s }[0], [x23], #0x4\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 8f\n"
- "ld1 { v31.h }[2], [x24], #0x2\n"
- "ld1 { v30.h }[2], [x23], #0x2\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[6], [x24]\n"
- "ld1 { v30.b }[6], [x23]\n"
- "ld1 { v29.b }[6], [x22]\n"
- "ld1 { v28.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[4], [x24]\n"
- "ld1 { v30.b }[4], [x23]\n"
- "ld1 { v29.b }[4], [x22]\n"
- "ld1 { v28.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x6, #1, 10f\n"
- "ld1 { v31.h }[0], [x24], #0x2\n"
- "ld1 { v30.h }[0], [x23], #0x2\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[2], [x24]\n"
- "ld1 { v30.b }[2], [x23]\n"
- "ld1 { v29.b }[2], [x22]\n"
- "ld1 { v28.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 11f\n"
- "ld1 { v31.b }[0], [x24]\n"
- "ld1 { v30.b }[0], [x23]\n"
- "ld1 { v29.b }[0], [x22]\n"
- "ld1 { v28.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v20.4s, v31.8h, v4.8h\n"
- "ldr x21, [x16, #0x28]\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v24.8b\n"
- "add x21, x21, x8\n"
- "usubl v29.8h, v29.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v26.4s, v31.8h, v1.8h\n"
- "smlal v25.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v20.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v24.8b\n"
- "smlal v9.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "smlal v25.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "tbz x6, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x6, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "ldr x20, [x16, #0x30]\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v20.4s, v27.8h, v7.8h\n"
- "add x20, x20, x8\n"
- "smlal v9.4s, v27.4h, v6.4h\n"
- "smlal2 v18.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v26.4s, v27.8h, v4.8h\n"
- "smlal v25.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "tbz x6, #2, 17f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 16f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x6, #1, 18f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 19f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x28, [x16, #0x38]\n"
- "smlal v25.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "add x28, x28, x8\n"
- "tbz x6, #2, 21f\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "tbz x6, #1, 20f\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[6], [x28]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[4], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x6, #1, 22f\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[2], [x28]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 23f\n"
- "ld1 { v28.b }[0], [x28]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x27, [x16, #0x40]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v0.4h\n"
- "smlal2 v18.4s, v28.8h, v0.8h\n"
- "add x27, x27, x8\n"
- "tbz x6, #2, 25f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x6, #1, 24f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x6, #1, 26f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 27f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x26, [x16, #0x48]\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v1.4h\n"
- "smlal2 v18.4s, v31.8h, v1.8h\n"
- "add x26, x26, x8\n"
- "tbz x6, #2, 29f\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "tbz x6, #1, 28f\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[6], [x26]\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[4], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x6, #1, 30f\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[2], [x26]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 31f\n"
- "ld1 { v30.b }[0], [x26]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x25, [x16, #0x50]\n"
- "smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v20.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v7.4h\n"
- "smlal2 v18.4s, v30.8h, v7.8h\n"
- "add x25, x25, x8\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "smlal v25.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "tbz x6, #2, 33f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "tbz x6, #1, 32f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x6, #1, 34f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 35f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x24, [x16, #0x58]\n"
- "smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v20.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v26.4s, v29.8h, v0.8h\n"
- "add x24, x24, x8\n"
- "tbz x6, #2, 37f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "tbz x6, #1, 36f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x6, #1, 38f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 39f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "ldr x23, [x16, #0x60]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v18.4s, v28.8h, v5.8h\n"
- "smlal v25.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "add x23, x23, x8\n"
- "tbz x6, #2, 41f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "tbz x6, #1, 40f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[6], [x23]\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[4], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x6, #1, 42f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[2], [x23]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 43f\n"
- "ld1 { v31.b }[0], [x23]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v24.8b\n"
- "ldr x22, [x16, #0x68]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v26.4s, v31.8h, v3.8h\n"
- "add x22, x22, x8\n"
- "tbz x6, #2, 45f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "tbz x6, #1, 44f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x6, #1, 46f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 47f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v24.8b\n"
- "ldr x21, [x16, #0x70]\n"
- "smlal v9.4s, v30.4h, v8.4h\n"
- "smlal2 v18.4s, v30.8h, v8.8h\n"
- "smlal v25.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "add x21, x21, x8\n"
- "tbz x6, #2, 49f\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "tbz x6, #1, 48f\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[6], [x21]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[4], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x6, #1, 50f\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[2], [x21]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 51f\n"
- "ld1 { v29.b }[0], [x21]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v24.8b\n"
- "ldr x20, [x16, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v26.4s, v29.8h, v7.8h\n"
- "smlal v25.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "add x20, x20, x8\n"
- "tbz x6, #2, 53f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x6, #1, 52f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x6, #1, 54f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 55f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v24.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v26.4s, v28.8h, v8.8h\n"
- "smlal v25.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "tbz x6, #2, 57f\n"
- "ld1 { v17.4s }, [x14], #0x10\n"
- "ld1 { v22.4s }, [x13], #0x10\n"
- "tbz x6, #1, 56f\n"
- "ld1 { v23.d }[0], [x14], #0x8\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[2], [x14]\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v23.s }[0], [x14]\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x6, #1, 58f\n"
- "ld1 { v17.d }[0], [x14], #0x8\n"
- "ld1 { v22.d }[0], [x13], #0x8\n"
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[2], [x14]\n"
- "ld1 { v22.s }[2], [x13]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 59f\n"
- "ld1 { v17.s }[0], [x14]\n"
- "ld1 { v22.s }[0], [x13]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "and v21.16b, v13.16b, v22.16b\n"
- "add x12, x12, x17\n"
- "add x11, x11, x17\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "add x10, x10, x17\n"
- "add x9, x9, x17\n"
- "and v29.16b, v20.16b, v19.16b\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v16.4s, v16.4s, v17.4s\n"
- "sqrdmulh v25.4s, v25.4s, v17.4s\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v0.16b, v9.16b, v22.16b\n"
- "sqrdmulh v18.4s, v18.4s, v23.4s\n"
- "and v27.16b, v16.16b, v22.16b\n"
- "sqrdmulh v26.4s, v26.4s, v23.4s\n"
- "and v21.16b, v25.16b, v22.16b\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v7.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v29.16b, v10.16b, v19.16b\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v27.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v21.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "srshl v9.4s, v9.4s, v22.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "sqadd v10.4s, v10.4s, v29.4s\n"
- "srshl v20.4s, v20.4s, v19.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v18.4s, v18.4s, v19.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sqxtn v16.4h, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v10.4s, v10.4s, v19.4s\n"
- "sqxtn v25.4h, v25.4s\n"
- "sqxtn2 v13.8h, v20.4s\n"
- "sqxtn2 v9.8h, v18.4s\n"
- "sqxtn2 v16.8h, v26.4s\n"
- "sqxtn2 v25.8h, v10.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v9.8h, v9.8h, v14.8h\n"
- "sqadd v16.8h, v16.8h, v14.8h\n"
- "sqadd v25.8h, v25.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v16.8h, v16.8h, v12.8h\n"
- "smax v25.8h, v25.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v9.8h, v9.8h, v11.8h\n"
- "smin v16.8h, v16.8h, v11.8h\n"
- "smin v25.8h, v25.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "tbz x6, #2, 61f\n"
- "st1 { v13.s }[0], [x12], #0x4\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
"st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v16.s }[0], [x10], #0x4\n"
- "st1 { v25.s }[0], [x9], #0x4\n"
- "tbz x6, #1, 60f\n"
- "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
"st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v16.h }[2], [x10], #0x2\n"
- "st1 { v25.h }[2], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v16.b }[6], [x10], #0x1\n"
- "st1 { v25.b }[6], [x9], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[4], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v16.b }[4], [x10], #0x1\n"
- "st1 { v25.b }[4], [x9], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x6, #1, 62f\n"
- "st1 { v13.h }[0], [x12], #0x2\n"
+ "tbz x7, #1, 62f\n"
"st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v16.h }[0], [x10], #0x2\n"
- "st1 { v25.h }[0], [x9], #0x2\n"
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v16.b }[2], [x10], #0x1\n"
- "st1 { v25.b }[2], [x9], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x6, #0, 63f\n"
- "st1 { v13.b }[0], [x12], #0x1\n"
+ "tbz x7, #0, 63f\n"
"st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v16.b }[0], [x10], #0x1\n"
- "st1 { v25.b }[0], [x9], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
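For readers tracing the register renames above, the requantization tail that each of these kernels repeats (sqrdmulh by the per-layer multiplier, the and/sshr/sqadd fixup, srshl by the stored shift, sqxtn/sqxtn2 narrowing, then the c_offset add and smax/smin clamp) computes the same thing in every version of the diff; only the register allocation changes. The following is a minimal scalar sketch of that sequence, for orientation only: the function name and parameters are illustrative, not taken from the library, and the tie-breaking for negative accumulators is simplified relative to the assembly's fixup.

#include <stdint.h>

/* Illustrative scalar model (not library code) of the requantize tail in
 * the kernels above, applied per output lane before storing u8 results. */
static inline uint8_t requantize_u8(int32_t acc, int32_t multiplier,
                                    int right_shift, int32_t c_offset,
                                    int32_t minval, int32_t maxval)
{
    /* sqrdmulh: saturating rounding doubling multiply, keeping the high
     * 32 bits. (2*p + 2^30) >> 31 is computed as (p + 2^29) >> 30 to
     * avoid 64-bit overflow when acc == multiplier == INT32_MIN. */
    int64_t p = (int64_t)acc * (int64_t)multiplier;
    int64_t h = (p + (1LL << 29)) >> 30;
    if (h > INT32_MAX) h = INT32_MAX;
    if (h < INT32_MIN) h = INT32_MIN;
    int32_t v = (int32_t)h;

    /* Rounding arithmetic right shift, standing in for the and/sshr/
     * sqadd/srshl group; negative-value tie-breaking is simplified here. */
    if (right_shift > 0)
        v = (int32_t)(((int64_t)v + (1LL << (right_shift - 1))) >> right_shift);

    /* sqadd of the c_offset, smax/smin clamp, then narrow to u8
     * (the uzp1/str pair in the assembly). */
    v += c_offset;
    if (v < minval) v = minval;
    if (v > maxval) v = maxval;
    return (uint8_t)v;
}

In the assembly this runs vectorised, four int32 accumulators at a time (the .4s forms), with the low and high halves of each widened accumulator pair narrowed together by sqxtn/sqxtn2 before the final byte store.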
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 9e80fbfc07..5d6fbac4bd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 5124b2c8f3..2cc802f9e6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -104,16 +104,16 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x8, x7, #0x3\n"
"add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "ld1r { v6.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
@@ -123,563 +123,563 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldp x11, x10, [x22, #0x0]\n"
"ldp x9, x28, [x22, #0x10]\n"
"cbz x8, 3f\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"subs x8, x8, #0x1\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d31, [x27, x17]\n"
- "ldr d30, [x26, x17]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d29, [x25, x17]\n"
- "ldr d28, [x24, x17]\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x23, x17]\n"
- "ldr d26, [x22, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x21, x17]\n"
- "ldr d24, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"add x14, x14, #0x48\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
"subs x8, x8, #0x1\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
"add x13, x13, #0x20\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
"sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
- "ldr q15, [x24, #0x0]\n"
- "ldr q17, [x24, #0x10]\n"
- "add x24, x24, #0x20\n"
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
"add x16, x16, #0x8\n"
- "str x24, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d8, [x14, #0x40]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr d31, [x27, x17]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d30, [x26, x17]\n"
- "ldr d29, [x25, x17]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d28, [x24, x17]\n"
- "ldr d27, [x23, x17]\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d26, [x22, x17]\n"
- "ldr d25, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d24, [x20, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x21, [x15, #0x50]\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr q19, [x13, #0x0]\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x17]\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
"ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x17]\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x17]\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "ldr q18, [x12, #0x0]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
"ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x17]\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
"ldr x25, [x15, #0x98]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
"ldr d27, [x23, x17]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "ldr x23, [x15, #0xa8]\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "ldr d24, [x21, x17]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
- "ldr q30, [x13, #0x10]\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x24, x17]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x25, x17]\n"
- "ldr q31, [x12, #0x10]\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
"tst x7, #0x7\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "ldr d24, [x23, x17]\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"add x12, x12, #0x20\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "ldr d25, [x21, x17]\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "and v0.16b, v15.16b, v18.16b\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "add x17, x17, #0x8\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
"sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x11, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d10, [x10, x16]\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str d9, [x9, x16]\n"
- "str d21, [x28, x16]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
"add x16, x16, #0x8\n"
"beq 88f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x7, #2, 5f\n"
- "ld1 { v15.4s }, [x24], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
"tbz x7, #1, 4f\n"
- "ld1 { v17.d }[0], [x24], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v17.s }[0], [x24]\n"
+ "ld1 { v3.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x7, #1, 6f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 7f\n"
- "ld1 { v15.s }[0], [x24]\n"
+ "ld1 { v5.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x14, #0x0]\n"
- "ldr d1, [x14, #0x8]\n"
- "mov v10.16b, v15.16b\n"
- "mov v20.16b, v17.16b\n"
- "ldr d2, [x14, #0x10]\n"
- "ldr d3, [x14, #0x18]\n"
- "mov v9.16b, v15.16b\n"
- "mov v23.16b, v17.16b\n"
- "ldr d4, [x14, #0x20]\n"
- "ldr d5, [x14, #0x28]\n"
- "mov v21.16b, v15.16b\n"
- "mov v22.16b, v17.16b\n"
- "ldr d6, [x14, #0x30]\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
"ldr d7, [x14, #0x38]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x14, #0x40]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
"ldp x27, x26, [x15, #0x0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v7.8h, v7.8b, v13.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"add x27, x27, x17\n"
"add x26, x26, x17\n"
"add x25, x25, x17\n"
@@ -689,700 +689,700 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"add x21, x21, x17\n"
"add x20, x20, x17\n"
"tbz x7, #2, 9f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x7, #1, 8f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v25.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v25.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x7, #1, 10f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v25.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 11f\n"
- "ld1 { v31.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v29.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v27.b }[0], [x23]\n"
- "ld1 { v26.b }[0], [x22]\n"
- "ld1 { v25.b }[0], [x21]\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v12.8b\n"
- "smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v17.4s, v31.8h, v8.8h\n"
- "ldr x24, [x15, #0x40]\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
- "smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "add x24, x24, x17\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v10.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v17.4s, v29.8h, v1.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v17.4s, v25.8h, v4.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v23.4s, v31.8h, v2.8h\n"
- "smlal v21.4s, v31.4h, v0.4h\n"
- "smlal2 v22.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
"tbz x7, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz x7, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v15.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v15.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x7, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v15.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v15.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal v10.4s, v29.4h, v4.4h\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "add x22, x22, x17\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x15, #0x50]\n"
- "smlal v10.4s, v28.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
"tbz x7, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v17.4s, v27.8h, v5.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x7, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v9.4s, v26.4h, v3.4h\n"
- "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 29f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 28f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
"tbz x7, #1, 30f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 31f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 33f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 32f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x7, #1, 34f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 35f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x21, [x15, #0x70]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 37f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 36f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x7, #1, 38f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 39f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x15, #0x78]\n"
- "smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v17.4s, v24.8h, v7.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v23.4s, v24.8h, v1.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x15, #0x80]\n"
- "smlal v21.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 45f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 44f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x7, #1, 46f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 47f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal v10.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v21.4s, v28.4h, v1.4h\n"
- "smlal2 v22.4s, v28.8h, v1.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 49f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 48f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x7, #1, 50f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 51f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x24, [x15, #0x90]\n"
- "smlal v21.4s, v26.4h, v5.4h\n"
- "smlal2 v22.4s, v26.8h, v5.8h\n"
- "add x24, x24, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 53f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 52f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x7, #1, 54f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 55f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v9.4s, v25.4h, v6.4h\n"
- "smlal2 v23.4s, v25.8h, v6.8h\n"
- "add x25, x25, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 57f\n"
- "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 56f\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x7, #1, 58f\n"
- "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 59f\n"
- "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xa0]\n"
- "smlal v10.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v21.4s, v29.4h, v2.4h\n"
- "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 61f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 60f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x7, #1, 62f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 63f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x23, [x15, #0xa8]\n"
- "smlal v9.4s, v27.4h, v7.4h\n"
- "smlal2 v23.4s, v27.8h, v7.8h\n"
- "add x23, x23, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 65f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 64f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x7, #1, 66f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 67f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x15, #0xb0]\n"
- "smlal v9.4s, v24.4h, v5.4h\n"
- "smlal2 v23.4s, v24.8h, v5.8h\n"
- "smlal v21.4s, v24.4h, v3.4h\n"
- "smlal2 v22.4s, v24.8h, v3.8h\n"
- "add x22, x22, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 69f\n"
- "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 68f\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x7, #1, 70f\n"
- "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 71f\n"
- "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x21, [x15, #0xb8]\n"
- "smlal v21.4s, v26.4h, v7.4h\n"
- "smlal2 v22.4s, v26.8h, v7.8h\n"
- "add x21, x21, x17\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
"tbz x7, #2, 73f\n"
- "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 72f\n"
- "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x7, #1, 74f\n"
- "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 75f\n"
- "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
"ldr x20, [x15, #0xc0]\n"
- "smlal v9.4s, v25.4h, v8.4h\n"
- "smlal2 v23.4s, v25.8h, v8.8h\n"
- "smlal v21.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
"add x20, x20, x17\n"
"tbz x7, #2, 77f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x7, #1, 76f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x7, #1, 78f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 79f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v21.4s, v29.4h, v8.4h\n"
- "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
"tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
"tbz x7, #1, 80f\n"
- "ld1 { v30.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v30.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
"tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "and v0.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
"add x11, x11, x16\n"
"add x10, x10, x16\n"
- "sqrdmulh v17.4s, v17.4s, v30.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"add x9, x9, x16\n"
"add x28, x28, x16\n"
- "and v7.16b, v17.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqadd v15.4s, v15.4s, v0.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v19.16b, v10.16b, v18.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "and v27.16b, v9.16b, v18.16b\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "and v0.16b, v21.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqadd v17.4s, v17.4s, v7.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v5.16b, v20.16b, v31.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v4.16b, v23.16b, v31.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v7.16b, v22.16b, v31.16b\n"
- "sqadd v10.4s, v10.4s, v19.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "srshl v17.4s, v17.4s, v31.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
"sqxtn v21.4h, v21.4s\n"
- "sqxtn2 v15.8h, v17.4s\n"
- "sqxtn2 v10.8h, v20.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
- "sqxtn2 v21.8h, v22.4s\n"
- "sqadd v15.8h, v15.8h, v11.8h\n"
- "sqadd v10.8h, v10.8h, v11.8h\n"
- "sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v21.8h, v21.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v16.8h\n"
- "smax v10.8h, v10.8h, v16.8h\n"
- "smax v9.8h, v9.8h, v16.8h\n"
- "smax v21.8h, v21.8h, v16.8h\n"
- "smin v15.8h, v15.8h, v14.8h\n"
- "smin v10.8h, v10.8h, v14.8h\n"
- "smin v9.8h, v9.8h, v14.8h\n"
- "smin v21.8h, v21.8h, v14.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"tbz x7, #2, 85f\n"
- "st1 { v15.s }[0], [x11], #0x4\n"
- "st1 { v10.s }[0], [x10], #0x4\n"
- "st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v21.s }[0], [x28], #0x4\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
"tbz x7, #1, 84f\n"
- "st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v10.h }[2], [x10], #0x2\n"
- "st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v21.h }[2], [x28], #0x2\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[6], [x11], #0x1\n"
- "st1 { v10.b }[6], [x10], #0x1\n"
- "st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v21.b }[6], [x28], #0x1\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[4], [x11], #0x1\n"
- "st1 { v10.b }[4], [x10], #0x1\n"
- "st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v21.b }[4], [x28], #0x1\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
"tbz x7, #1, 86f\n"
- "st1 { v15.h }[0], [x11], #0x2\n"
- "st1 { v10.h }[0], [x10], #0x2\n"
- "st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v21.h }[0], [x28], #0x2\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[2], [x11], #0x1\n"
- "st1 { v10.b }[2], [x10], #0x1\n"
- "st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v21.b }[2], [x28], #0x1\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x7, #0, 87f\n"
- "st1 { v15.b }[0], [x11], #0x1\n"
- "st1 { v10.b }[0], [x10], #0x1\n"
- "st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v21.b }[0], [x28], #0x1\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
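
[Editor's note] The "Oddments" ladders in the hunk above (tbz on bits #2/#1/#0 of the leftover channel count, followed by ld1 lane loads of an .s, .h and .b element) implement masked tail loads without reading past the end of a row. A minimal scalar sketch of that strategy, assuming a leftover count n in 1..7; the function and parameter names are illustrative, not library code:

#include <cstdint>
#include <cstring>

// Scalar model of the tbz/ld1 tail ladders: decompose the leftover byte
// count n bit by bit, issuing a 4-, 2- and 1-byte load per set bit so that
// exactly n bytes of the 8-byte vector lane are populated.
static void load_tail(uint8_t dst[8], const uint8_t *src, unsigned n)
{
    unsigned pos = 0;
    if (n & 4) { std::memcpy(dst + pos, src + pos, 4); pos += 4; } // tbz #2 -> ld1 { v.s }[0]
    if (n & 2) { std::memcpy(dst + pos, src + pos, 2); pos += 2; } // tbz #1 -> ld1 { v.h }[pos/2]
    if (n & 1) { dst[pos] = src[pos]; }                            // tbz #0 -> ld1 { v.b }[pos]
}

The matching "st1" ladders at the end of the kernel store the output with the same bit decomposition, which is why each Oddments block ends with the mirrored tbz/st1 sequence.
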
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 19767e2823..32117ad1e6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
@@ -34,15 +34,7 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
- const unsigned int,
- const uint8_t *const *const,
- const int8_t *const,
- const int32_t *const,
- const arm_gemm::Requantize32 &,
- const int32_t *const,
- const int32_t *const,
- uint8_t *const *const);
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
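
[Editor's note] The arm_gemm::Requantize32 argument in the declaration above carries the multipliers, shifts and offsets consumed by the sqrdmulh/srshl/sqadd/smax/smin epilogue seen in these kernels. A hedged scalar model of one output element of that epilogue, under gemmlowp-style fixed-point assumptions; names are illustrative and the negative-value rounding fixup (the and/sshr #31/sqadd sequence in the assembly) is omitted:

#include <algorithm>
#include <cstdint>

// Sketch of per-element requantisation: fixed-point multiply (sqrdmulh),
// rounding right shift (srshl with a negative shift), offset add (sqadd),
// clamp (smax/smin) and narrow to u8 (sqxtn/uzp1).
static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling high multiply,
    // i.e. (acc * mul + 2^30) >> 31, saturated to int32.
    int64_t prod = (int64_t)acc * (int64_t)mul + (1LL << 30);
    int32_t high = (int32_t)std::min<int64_t>(prod >> 31, INT32_MAX);

    // srshl with a negative shift amount acts as a rounding
    // arithmetic shift right (round half up in this sketch).
    if (shift < 0)
    {
        const int32_t r = -shift;
        high = (high + (1 << (r - 1))) >> r;
    }

    // sqadd of the c_offset, clamp to [minval, maxval], narrow to u8.
    return (uint8_t)std::max(minval, std::min(maxval, high + c_offset));
}

In the vectorised code each of these steps is applied to four accumulators at a time, which is why the epilogue interleaves the same five instructions across the v5/v21/v20/v19 (formerly v15/v10/v9/v21) accumulator registers.
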
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 1ce037b68c..df955206e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -112,1188 +112,1188 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
"lsr x2, x1, #0x3\n"
- "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v9.16b }, [x3]\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
- "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x11]\n"
- "ld1r { v14.8h }, [x5]\n"
- "add x3, x13, %[offsetof_Requantize32_minval]\n"
- "add x15, x13, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.8h }, [x3]\n"
- "ld1r { v11.8h }, [x15]\n"
- "mov x0, #0x0\n"
- "mov x10, #0x0\n"
- "add x4, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x6, [x24, #0x0]\n"
- "ldp x7, x16, [x24, #0x10]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
"cbz x2, 3f\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
"subs x2, x2, #0x1\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ldr d31, [x9, x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldr d30, [x28, x0]\n"
- "ldr d29, [x27, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x26, x0]\n"
- "ldr d27, [x25, x0]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x24, x0]\n"
- "ldr d25, [x23, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x22, x0]\n"
- "ldr d26, [x21, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x20, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
"subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
"smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x25, [x4, #0xf0]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
"add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "add x3, x3, #0xc8\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "ldr q13, [x13, #0x0]\n"
- "ldr q19, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "add x10, x10, #0x8\n"
- "str x13, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr d31, [x9, x0]\n"
- "ldr d30, [x28, x0]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr d29, [x27, x0]\n"
- "ldr d28, [x26, x0]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d27, [x25, x0]\n"
- "ldr d23, [x24, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d25, [x23, x0]\n"
- "ldr d24, [x22, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d26, [x21, x0]\n"
- "ldr d22, [x20, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q18, [x5, #0x0]\n"
- "ldr q6, [x8, #0x0]\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr q5, [x5, #0x10]\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "ldr x22, [x4, #0x58]\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "ldr x20, [x4, #0x68]\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "ldr d30, [x22, x0]\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "ldr x25, [x4, #0x78]\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "ldr x24, [x4, #0x88]\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ldr d27, [x21, x0]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "ldr x21, [x4, #0x98]\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "ldr x13, [x4, #0xa8]\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr d25, [x20, x0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "ldr x20, [x4, #0xb8]\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "ldr x22, [x4, #0xc8]\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "ldr d24, [x26, x0]\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "ldr x28, [x4, #0xd8]\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x25, x0]\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr d28, [x24, x0]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "ldr x24, [x4, #0xf8]\n"
- "tst x1, #0x7\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "add x5, x5, #0x20\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "ldr d23, [x23, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr d31, [x15, x0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr d30, [x21, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "ldr d22, [x20, x0]\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x0]\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
"smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ldr d25, [x13, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x12, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "ldr d27, [x11, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "ldr d23, [x22, x0]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "ldr d31, [x9, x0]\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x27, x0]\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr d30, [x28, x0]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "ldr d26, [x26, x0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr d25, [x25, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr d24, [x24, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "ldr d27, [x23, x0]\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "ldr q22, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x15, x0]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x21, x0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
"smlal v20.4s, v30.4h, v2.4h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "add x0, x0, #0x8\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
"sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d13, [x17, x10]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d20, [x6, x10]\n"
- "str d8, [x7, x10]\n"
- "str d17, [x16, x10]\n"
- "add x10, x10, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x6, x6, #0xc8\n"
"3:" // Oddments
- "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v15.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v13.d }[0], [x13], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v7.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v13.s }[0], [x13]\n"
+ "ld1 { v7.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v20.16b, v13.16b\n"
- "mov v10.16b, v19.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v13.16b\n"
- "mov v7.16b, v19.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ldp x9, x28, [x4, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "mov v21.16b, v19.16b\n"
- "ldp x27, x26, [x4, #0x10]\n"
- "ldp x25, x24, [x4, #0x20]\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x4, #0x30]\n"
- "ldp x21, x20, [x4, #0x40]\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x9, x9, x0\n"
- "add x28, x28, x0\n"
- "add x27, x27, x0\n"
- "add x26, x26, x0\n"
- "add x25, x25, x0\n"
- "add x24, x24, x0\n"
- "add x23, x23, x0\n"
- "add x22, x22, x0\n"
- "add x21, x21, x0\n"
- "add x20, x20, x0\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 9f\n"
"ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "ld1 { v29.s }[0], [x27], #0x4\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
"ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
"ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[6], [x9]\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[4], [x9]\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
"ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "ld1 { v29.h }[0], [x27], #0x2\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
"ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[2], [x9]\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
"ld1 { v31.b }[0], [x9]\n"
- "ld1 { v30.b }[0], [x28]\n"
- "ld1 { v29.b }[0], [x27]\n"
- "ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x25]\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
"ld1 { v25.b }[0], [x23]\n"
- "ld1 { v24.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "ldr x20, [x4, #0x50]\n"
- "usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v8.4s, v29.4h, v0.4h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "add x20, x20, x0\n"
- "smlal2 v7.4s, v29.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v1.4h\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v8.4s, v28.4h, v1.4h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v7.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v19.4s, v27.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v23.4h, v2.4h\n"
- "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x22, [x4, #0x58]\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "add x22, x22, x0\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v31.4h, v3.4h\n"
- "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
"tbz x1, #2, 17f\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v6.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x21, [x4, #0x60]\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "add x21, x21, x0\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 21f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d0, [x3, #0x28]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v20.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v8.4s, v30.4h, v4.4h\n"
- "smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x4, #0x68]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v29.4h, v0.4h\n"
- "smlal2 v19.4s, v29.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v8.4s, v22.4h, v0.4h\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
"tbz x1, #2, 25f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
@@ -1315,869 +1315,869 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 27f\n"
"ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x26, [x4, #0x70]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v20.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v25.4h, v1.4h\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x25, [x4, #0x78]\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v23.4h, v2.4h\n"
- "smlal2 v19.4s, v23.8h, v2.8h\n"
- "smlal v20.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v8.4s, v24.4h, v2.4h\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x23, [x4, #0x80]\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v31.4h, v3.4h\n"
- "smlal2 v19.4s, v31.8h, v3.8h\n"
- "smlal v20.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v27.4h, v3.4h\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x24, [x4, #0x88]\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "add x24, x24, x0\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v19.4s, v30.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x15, [x4, #0x90]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v22.4h, v0.4h\n"
- "smlal2 v19.4s, v22.8h, v0.8h\n"
- "smlal v20.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x15], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x15], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x15]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x15]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x15], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x15]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x15]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x4, #0x98]\n"
- "smlal v8.4s, v31.4h, v0.4h\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x21, x21, x0\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x14, [x4, #0xa0]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "add x14, x14, x0\n"
- "smlal v13.4s, v25.4h, v1.4h\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "smlal v20.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v8.4s, v30.4h, v1.4h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x4, #0xa8]\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "add x13, x13, x0\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "smlal2 v19.4s, v24.8h, v2.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v8.4s, v26.4h, v2.4h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[6], [x13]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[4], [x13]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[2], [x13]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v25.b }[0], [x13]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x4, #0xb0]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x12, x12, x0\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v19.4s, v27.8h, v3.8h\n"
- "smlal v20.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v24.s }[0], [x12], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v24.h }[2], [x12], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[6], [x12]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[4], [x12]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v24.h }[0], [x12], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[2], [x12]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v24.b }[0], [x12]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0xb8]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v23.4h, v4.4h\n"
- "smlal2 v19.4s, v23.8h, v4.8h\n"
- "smlal v20.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v14.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "ld1 { v14.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v14.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "ld1 { v14.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x11, [x4, #0xc0]\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "add x11, x11, x0\n"
- "smlal v13.4s, v31.4h, v0.4h\n"
- "smlal2 v19.4s, v31.8h, v0.8h\n"
- "smlal v20.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x22, [x4, #0xc8]\n"
- "smlal v8.4s, v27.4h, v0.4h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x22, x22, x0\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 73f\n"
- "ld1 { v23.s }[0], [x22], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v23.h }[0], [x22], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v23.b }[0], [x22]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x9, [x4, #0xd0]\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "add x9, x9, x0\n"
- "smlal v13.4s, v30.4h, v1.4h\n"
- "smlal2 v19.4s, v30.8h, v1.8h\n"
- "smlal v20.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x28, [x4, #0xd8]\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "add x28, x28, x0\n"
- "smlal v13.4s, v26.4h, v2.4h\n"
- "smlal2 v19.4s, v26.8h, v2.8h\n"
- "smlal v20.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x27, [x4, #0xe0]\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "add x27, x27, x0\n"
- "smlal v13.4s, v25.4h, v3.4h\n"
- "smlal2 v19.4s, v25.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x26, [x4, #0xe8]\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "add x26, x26, x0\n"
- "smlal v13.4s, v24.4h, v4.4h\n"
- "smlal2 v19.4s, v24.8h, v4.8h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v15.8b\n"
- "ldr x25, [x4, #0xf0]\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "add x25, x25, x0\n"
- "smlal v13.4s, v27.4h, v0.4h\n"
- "smlal2 v19.4s, v27.8h, v0.8h\n"
- "smlal v20.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
"tbz x1, #2, 93f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v12.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v12.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x1, #1, 94f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v12.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 95f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x24, [x4, #0xf8]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x24, x24, x0\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
"tbz x1, #2, 97f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x4, #0x100]\n"
- "smlal v17.4s, v24.4h, v0.4h\n"
- "smlal2 v21.4s, v24.8h, v0.8h\n"
- "add x23, x23, x0\n"
- "smlal v13.4s, v23.4h, v1.4h\n"
- "smlal2 v19.4s, v23.8h, v1.8h\n"
- "smlal v20.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v15.8b\n"
- "ldr x15, [x4, #0x108]\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "add x15, x15, x0\n"
- "smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v25.s }[0], [x15], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v25.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[6], [x15]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[4], [x15]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v25.h }[0], [x15], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[2], [x15]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v25.b }[0], [x15]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v15.8b\n"
- "ldr x21, [x4, #0x110]\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "add x21, x21, x0\n"
- "smlal v13.4s, v30.4h, v3.4h\n"
- "smlal2 v19.4s, v30.8h, v3.8h\n"
- "smlal v20.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v8.4s, v25.4h, v3.4h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr x20, [x4, #0x118]\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "add x20, x20, x0\n"
- "smlal v13.4s, v28.4h, v4.4h\n"
- "smlal2 v19.4s, v28.8h, v4.8h\n"
- "smlal v20.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v8.4s, v24.4h, v4.4h\n"
- "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v9.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v18.4s }, [x5], #0x10\n"
- "ld1 { v6.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v5.d }[0], [x5], #0x8\n"
- "ld1 { v22.d }[0], [x8], #0x8\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[2], [x5]\n"
- "ld1 { v22.s }[2], [x8]\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v5.s }[0], [x5]\n"
- "ld1 { v22.s }[0], [x8]\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v6.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v6.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v6.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "and v30.16b, v13.16b, v6.16b\n"
- "add x17, x17, x10\n"
- "add x6, x6, x10\n"
- "sqrdmulh v19.4s, v19.4s, v5.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x7, x7, x10\n"
- "add x16, x16, x10\n"
- "and v16.16b, v19.16b, v22.16b\n"
- "sqrdmulh v20.4s, v20.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v0.16b, v20.16b, v6.16b\n"
- "sqrdmulh v10.4s, v10.4s, v5.4s\n"
- "and v18.16b, v8.16b, v6.16b\n"
- "sqrdmulh v7.4s, v7.4s, v5.4s\n"
- "and v30.16b, v17.16b, v6.16b\n"
- "sqrdmulh v21.4s, v21.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v10.16b, v22.16b\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "and v23.16b, v7.16b, v22.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v16.16b, v21.16b, v22.16b\n"
- "sqadd v20.4s, v20.4s, v0.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v6.4s\n"
- "srshl v20.4s, v20.4s, v6.4s\n"
- "sqadd v10.4s, v10.4s, v26.4s\n"
- "srshl v8.4s, v8.4s, v6.4s\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "srshl v17.4s, v17.4s, v6.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v22.4s\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v7.4s, v7.4s, v22.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "sqxtn v17.4h, v17.4s\n"
- "sqxtn2 v13.8h, v19.4s\n"
- "sqxtn2 v20.8h, v10.4s\n"
- "sqxtn2 v8.8h, v7.4s\n"
- "sqxtn2 v17.8h, v21.4s\n"
- "sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v20.8h, v20.8h, v14.8h\n"
- "sqadd v8.8h, v8.8h, v14.8h\n"
- "sqadd v17.8h, v17.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v20.8h, v20.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smax v17.8h, v17.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v11.8h\n"
- "smin v20.8h, v20.8h, v11.8h\n"
- "smin v8.8h, v8.8h, v11.8h\n"
- "smin v17.8h, v17.8h, v11.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v13.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x7], #0x4\n"
- "st1 { v17.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v13.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x7], #0x2\n"
- "st1 { v17.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x7], #0x1\n"
- "st1 { v17.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x7], #0x1\n"
- "st1 { v17.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v13.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x7], #0x2\n"
- "st1 { v17.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x7], #0x1\n"
- "st1 { v17.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v13.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x7], #0x1\n"
- "st1 { v17.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
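
[Editor's note] The inner blocks above pair a usubl on each input tile (subtracting the activation offset held in v18/v9) with an ssubl on each weight (subtracting the weight offset in v13/v15) before accumulating with smlal/smlal2. A minimal scalar model of that u8-input, s8-weight multiply-accumulate step follows; the function name and signature are illustrative only, not taken from the library.

    #include <cstdint>

    // One usubl/ssubl + smlal step: widen both operands to int16 while
    // removing their zero points, then multiply-accumulate into a 32-bit
    // accumulator. Illustrative names, not the library's API.
    static inline int32_t mla_u8s8(int32_t acc,
                                   uint8_t input, int32_t a_offset,  // usubl
                                   int8_t weight, int32_t b_offset)  // ssubl
    {
        const int16_t in = (int16_t)((int32_t)input - a_offset);   // in [-255, 255]
        const int16_t wt = (int16_t)((int32_t)weight - b_offset);  // in [-255, 255]
        return acc + (int32_t)in * (int32_t)wt;                    // smlal/smlal2
    }

Each smlal/smlal2 pair in the kernel applies this to the low and high halves of an eight-lane vector at once, which is why every accumulator appears twice (a .4s pair per output row).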
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
index 6bdcca115c..2c677d2f62 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
@@ -47,4 +47,5 @@ class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
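
[Editor's note] The store epilogues in these kernels (the sqrdmulh/and/sshr/sqadd/srshl runs above, repeated in the generic kernel below) requantize each 32-bit accumulator back to 8 bits: a fixed-point multiply by the requant multiplier, a rounding right shift with a sign fixup so ties round away from zero, the c_offset, and a min/max clamp. A scalar sketch under the assumption of per-layer parameters; all names are illustrative, not the library's. Note the kernel adds c_offset and clamps on narrowed 16-bit lanes after sqxtn; doing it in 32 bits here is equivalent for in-range parameters.

    #include <algorithm>
    #include <cstdint>

    // Per-lane model of the vector requantisation epilogue.
    static int32_t sqrdmulh_s32(int32_t a, int32_t b)
    {
        // Saturating rounding doubling multiply, high half (sqrdmulh).
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        const int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)((p + (1LL << 30)) >> 31);
    }

    static int32_t rounding_shift_right(int32_t x, int n)  // n >= 0
    {
        if (n == 0) return x;
        // The and/sshr #31/sqadd trio subtracts 1 from negative lanes so
        // the following srshl (rounding shift) rounds ties away from zero.
        if (x < 0) x -= 1;
        return (x + (1 << (n - 1))) >> n;
    }

    static uint8_t requantize_lane(int32_t acc, int32_t mul, int shift,
                                   int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rounding_shift_right(sqrdmulh_s32(acc, mul), shift);
        v += c_offset;                              // sqadd with the c_offset splat
        v = std::min(std::max(v, minval), maxval);  // smax/smin clamp
        return (uint8_t)v;                          // sqxtn + uzp1 narrowing
    }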
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 1676119bc1..c2bec4cdab 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -41,7 +42,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "lsr x12, %x[n_channels], #0x2\n"
+ "lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
@@ -59,7 +60,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
"ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "cbz x12, 6f\n"
+ "cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
@@ -67,34 +68,34 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
- "subs x20, %x[n_points], #0x1\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldr s16, [x28, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x27, x11]\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x26, x11]\n"
- "ldr s19, [x25, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr s20, [x24, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -103,35 +104,35 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x10, x11]\n"
- "ldr s15, [x9, x11]\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x28, x11]\n"
- "ldr s17, [x27, x11]\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldr s18, [x26, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x25, x11]\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x20, x20, #0x1\n"
+ "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x24, x11]\n"
+ "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x23, x11]\n"
- "ldr x22, [x21], #0x8\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x22, x11]\n"
+ "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
@@ -167,45 +168,45 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v24.4s, v24.4s, v2.4s\n"
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -270,7 +271,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"str s30, [x21, x11]\n"
"str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x12, LSL #2\n"
+ "cmp x11, x9, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
@@ -288,61 +289,61 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Bit 1: End
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x21, %x[inptrs]\n"
- "ldp x10, x9, [x21], #0x10\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x28, x27, [x21], #0x10\n"
- "ldp x26, x25, [x21], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldr x22, [x21], #0x8\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
"mov v30.16b, v23.16b\n"
- "add x10, x10, x11\n"
"add x9, x9, x11\n"
+ "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x28, x28, x11\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
"subs x20, %x[n_points], #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -356,62 +357,62 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "ldp x10, x9, [x21], #0x10\n"
- "ldp x28, x27, [x21], #0x10\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x26, x25, [x21], #0x10\n"
- "ldp x24, x23, [x21], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x22, [x21], #0x8\n"
- "add x10, x10, x11\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "add x9, x9, x11\n"
"add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
- "add x27, x27, x11\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h14, [x10], #0x2\n"
- "ldr h15, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h17, [x27], #0x2\n"
- "ldr h18, [x26], #0x2\n"
- "ldr h19, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h21, [x23], #0x2\n"
- "ldr h22, [x22], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.b }[2], [x10], #0x1\n"
- "ld1 { v15.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v17.b }[2], [x27], #0x1\n"
- "ld1 { v18.b }[2], [x26], #0x1\n"
- "ld1 { v19.b }[2], [x25], #0x1\n"
- "ld1 { v20.b }[2], [x24], #0x1\n"
- "ld1 { v21.b }[2], [x23], #0x1\n"
- "ld1 { v22.b }[2], [x22], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "ldr b14, [x10], #0x1\n"
- "ldr b15, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b17, [x27], #0x1\n"
- "ldr b18, [x26], #0x1\n"
- "ldr b19, [x25], #0x1\n"
- "ldr b20, [x24], #0x1\n"
- "ldr b21, [x23], #0x1\n"
- "ldr b22, [x22], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
"subs x20, x20, #0x1\n"
"usubl v14.8h, v14.8b, v6.8b\n"
@@ -457,9 +458,7 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"cbz %x[rq_left_shift_ptr], 19f\n"
"ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
-
"20:" // Oddments: Load quantisation parameters: Bit 1: End
-
"21:" // Oddments: Load quantisation parameters: Done
"sshl v23.4s, v23.4s, v3.4s\n"
"sshl v24.4s, v24.4s, v3.4s\n"
@@ -473,11 +472,11 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"sqrdmulh v25.4s, v25.4s, v2.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"add x28, x28, x11\n"
- "and v21.16b, v23.16b, v1.16b\n"
- "and v20.16b, v24.16b, v1.16b\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
- "and v19.16b, v25.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
@@ -490,36 +489,36 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"add x21, x21, x11\n"
"add x20, x20, x11\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "and v18.16b, v26.16b, v1.16b\n"
- "and v17.16b, v27.16b, v1.16b\n"
- "and v16.16b, v28.16b, v1.16b\n"
- "and v21.16b, v29.16b, v1.16b\n"
- "and v20.16b, v30.16b, v1.16b\n"
- "and v19.16b, v31.16b, v1.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v23.4s, v23.4s, v1.4s\n"
"srshl v24.4s, v24.4s, v1.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
@@ -606,15 +605,14 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"st1 { v30.b }[0], [x21], #0x1\n"
"st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
-
"24:" // End
-
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
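
[Editor's note] Beyond the register renumbering, the recurring structural change in this patch is visible at the top and bottom of the file above: the architecture guard now sits after the includes rather than before them, and the closing #endif is separated from the namespace closers by a blank line. The resulting shape of each generic.cpp, as the hunks themselves show (implementation elided):

    #include "arm_gemm.hpp"
    #include <cstddef>
    #include <cstdint>

    #if defined(__aarch64__)

    namespace arm_conv {
    namespace depthwise {

    // kernel implementation elided

    }  // namespace depthwise
    }  // namespace arm_conv

    #endif  // defined(__aarch64__)

Hoisting the includes above the guard means the translation unit sees the same headers on every architecture, so a non-aarch64 build still type-checks the includes instead of compiling an entirely empty file.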
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 394df363da..b7ba363b43 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 976434aa28..ed99f1f642 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -22,12 +22,13 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
@@ -47,21 +48,21 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
__asm__ __volatile__(
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v3.16b }, [x20]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"ld1r { v12.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "ld1r { v11.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v10.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
@@ -89,256 +90,256 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"cbz %x[rq_mul_ptr], 3f\n"
"lsl x20, x9, #0x2\n"
"ldr q9, [%x[rq_mul_ptr], x20]\n"
- "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 7f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -347,263 +348,263 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"6:" // Output channel loop: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldr d7, [x28, #0x0]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -612,224 +613,224 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"b 8f\n"
"7:" // Output channel loop: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "sshl v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -838,62 +839,62 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x20, x9]\n"
+ "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x21, x9]\n"
+ "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x22, x9]\n"
+ "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x23, x9]\n"
+ "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s28, [x24, x9]\n"
- "str s29, [x25, x9]\n"
- "str s30, [x26, x9]\n"
- "str s31, [x27, x9]\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
"cmp x9, x10, LSL #2\n"
@@ -936,354 +937,354 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x22], #0x8\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
"ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
"ld1 { v9.s }[0], [x22], #0x4\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s8, [%x[weights]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "ldp x25, x28, [x20], #0x10\n"
- "lsr x21, %x[kernel_points], #0x1\n"
- "ldr d2, [x25, #0x0]\n"
- "ldr d7, [x28, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "cbz x21, 22f\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "subs x21, x21, #0x1\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d1, [x25, #0x0]\n"
- "ldr d0, [x28, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "subs x21, x21, #0x1\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "ldr d1, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v3.8b\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "ldr d0, [x28, #0x0]\n"
- "ldr s6, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v3.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "ldp x25, x28, [x20], #0x10\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "ldr d2, [x25, #0x0]\n"
- "usubl v2.8h, v2.8b, v3.8b\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
- "ldr d7, [x28, #0x0]\n"
- "ldr s8, [%x[weights]], #0x4\n"
- "smlal v16.4s, v6.4h, v1.h[0]\n"
- "smlal v17.4s, v6.4h, v1.h[1]\n"
- "usubl v7.8h, v7.8b, v3.8b\n"
- "smlal v18.4s, v6.4h, v1.h[2]\n"
- "smlal v19.4s, v6.4h, v1.h[3]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "smlal v20.4s, v6.4h, v1.h[4]\n"
- "smlal v21.4s, v6.4h, v1.h[5]\n"
- "smlal v22.4s, v6.4h, v1.h[6]\n"
- "smlal v23.4s, v6.4h, v1.h[7]\n"
- "smlal v24.4s, v6.4h, v0.h[0]\n"
- "smlal v25.4s, v6.4h, v0.h[1]\n"
- "smlal v26.4s, v6.4h, v0.h[2]\n"
- "smlal v27.4s, v6.4h, v0.h[3]\n"
- "smlal v28.4s, v6.4h, v0.h[4]\n"
- "smlal v29.4s, v6.4h, v0.h[5]\n"
- "smlal v30.4s, v6.4h, v0.h[6]\n"
- "smlal v31.4s, v6.4h, v0.h[7]\n"
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v16.4s, v8.4h, v2.h[0]\n"
- "smlal v17.4s, v8.4h, v2.h[1]\n"
- "smlal v18.4s, v8.4h, v2.h[2]\n"
- "smlal v19.4s, v8.4h, v2.h[3]\n"
- "smlal v20.4s, v8.4h, v2.h[4]\n"
- "smlal v21.4s, v8.4h, v2.h[5]\n"
- "smlal v22.4s, v8.4h, v2.h[6]\n"
- "smlal v23.4s, v8.4h, v2.h[7]\n"
- "smlal v24.4s, v8.4h, v7.h[0]\n"
- "smlal v25.4s, v8.4h, v7.h[1]\n"
- "smlal v26.4s, v8.4h, v7.h[2]\n"
- "smlal v27.4s, v8.4h, v7.h[3]\n"
- "smlal v28.4s, v8.4h, v7.h[4]\n"
- "smlal v29.4s, v8.4h, v7.h[5]\n"
- "smlal v30.4s, v8.4h, v7.h[6]\n"
- "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v16.4s, v16.4s, v15.4s\n"
- "sshl v17.4s, v17.4s, v15.4s\n"
- "sshl v18.4s, v18.4s, v15.4s\n"
- "sshl v19.4s, v19.4s, v15.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "and v5.16b, v16.16b, v10.16b\n"
- "and v4.16b, v17.16b, v10.16b\n"
- "and v2.16b, v18.16b, v10.16b\n"
- "and v1.16b, v19.16b, v10.16b\n"
- "sshl v20.4s, v20.4s, v15.4s\n"
- "sshl v21.4s, v21.4s, v15.4s\n"
- "sshl v22.4s, v22.4s, v15.4s\n"
- "sshl v23.4s, v23.4s, v15.4s\n"
- "sshl v24.4s, v24.4s, v15.4s\n"
- "sshl v25.4s, v25.4s, v15.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v20.4s, v20.4s, v9.4s\n"
"sqrdmulh v21.4s, v21.4s, v9.4s\n"
"sqrdmulh v22.4s, v22.4s, v9.4s\n"
"sqrdmulh v23.4s, v23.4s, v9.4s\n"
"sqrdmulh v24.4s, v24.4s, v9.4s\n"
"sqrdmulh v25.4s, v25.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v5.4s\n"
- "sqadd v17.4s, v17.4s, v4.4s\n"
- "sqadd v18.4s, v18.4s, v2.4s\n"
- "sqadd v19.4s, v19.4s, v1.4s\n"
- "and v8.16b, v20.16b, v10.16b\n"
- "and v0.16b, v21.16b, v10.16b\n"
- "and v5.16b, v22.16b, v10.16b\n"
- "and v4.16b, v23.16b, v10.16b\n"
- "and v2.16b, v24.16b, v10.16b\n"
- "and v1.16b, v25.16b, v10.16b\n"
- "sshl v26.4s, v26.4s, v15.4s\n"
- "sshl v27.4s, v27.4s, v15.4s\n"
- "sshl v28.4s, v28.4s, v15.4s\n"
- "sshl v29.4s, v29.4s, v15.4s\n"
- "sshl v30.4s, v30.4s, v15.4s\n"
- "sshl v31.4s, v31.4s, v15.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v9.4s\n"
"sqrdmulh v27.4s, v27.4s, v9.4s\n"
"sqrdmulh v28.4s, v28.4s, v9.4s\n"
"sqrdmulh v29.4s, v29.4s, v9.4s\n"
"sqrdmulh v30.4s, v30.4s, v9.4s\n"
"sqrdmulh v31.4s, v31.4s, v9.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v0.4s\n"
- "sqadd v22.4s, v22.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sqadd v25.4s, v25.4s, v1.4s\n"
- "and v8.16b, v26.16b, v10.16b\n"
- "and v0.16b, v27.16b, v10.16b\n"
- "and v5.16b, v28.16b, v10.16b\n"
- "and v4.16b, v29.16b, v10.16b\n"
- "and v2.16b, v30.16b, v10.16b\n"
- "and v1.16b, v31.16b, v10.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
"sshr v1.4s, v1.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v8.4s\n"
- "sqadd v27.4s, v27.4s, v0.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v2.4s\n"
- "sqadd v31.4s, v31.4s, v1.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v10.4s\n"
- "srshl v21.4s, v21.4s, v10.4s\n"
- "srshl v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v10.4s\n"
- "srshl v27.4s, v27.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v10.4s\n"
- "srshl v29.4s, v29.4s, v10.4s\n"
- "srshl v30.4s, v30.4s, v10.4s\n"
- "srshl v31.4s, v31.4s, v10.4s\n"
- "add v16.4s, v16.4s, v14.4s\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v23.4s, v23.4s, v14.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v16.4s, v16.4s, v11.4s\n"
- "smin v17.4s, v17.4s, v11.4s\n"
- "smin v18.4s, v18.4s, v11.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v13.4s\n"
- "smax v17.4s, v17.4s, v13.4s\n"
- "smax v18.4s, v18.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v23.4s, v23.4s, v13.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
@@ -1317,158 +1318,156 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.h }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.h }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.h }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.h }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.h }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.h }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.h }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v24.h }[0], [x20]\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v26.h }[0], [x22]\n"
- "st1 { v27.h }[0], [x23]\n"
- "st1 { v28.h }[0], [x24]\n"
- "st1 { v29.h }[0], [x25]\n"
- "st1 { v30.h }[0], [x26]\n"
- "st1 { v31.h }[0], [x27]\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[2], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[2], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[2], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[2], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[2], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[2], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[2], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[2], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v25.b }[2], [x21]\n"
- "st1 { v26.b }[2], [x22]\n"
- "st1 { v27.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x24]\n"
- "st1 { v29.b }[2], [x25]\n"
- "st1 { v30.b }[2], [x26]\n"
- "st1 { v31.b }[2], [x27]\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "add x20, x20, x9\n"
- "add x21, x21, x9\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "add x22, x22, x9\n"
- "add x23, x23, x9\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "add x24, x24, x9\n"
- "add x25, x25, x9\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "add x26, x26, x9\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"add x27, x27, x9\n"
- "st1 { v16.b }[0], [x20]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x20, x20, x9\n"
- "st1 { v17.b }[0], [x21]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "add x21, x21, x9\n"
- "st1 { v18.b }[0], [x22]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "add x22, x22, x9\n"
- "st1 { v19.b }[0], [x23]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "add x23, x23, x9\n"
- "st1 { v20.b }[0], [x24]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "add x24, x24, x9\n"
- "st1 { v21.b }[0], [x25]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "add x25, x25, x9\n"
- "st1 { v22.b }[0], [x26]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v23.b }[0], [x27]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"add x27, x27, x9\n"
- "st1 { v24.b }[0], [x20]\n"
- "st1 { v25.b }[0], [x21]\n"
- "st1 { v26.b }[0], [x22]\n"
- "st1 { v27.b }[0], [x23]\n"
- "st1 { v28.b }[0], [x24]\n"
- "st1 { v29.b }[0], [x25]\n"
- "st1 { v30.b }[0], [x26]\n"
- "st1 { v31.b }[0], [x27]\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
-
"26:" // Done
-
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -1477,4 +1476,5 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
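
Editor's note on the requantization epilogue above ("Output channel oddments: Done"): the SSHL / SQRDMULH / AND+SSHR+SQADD / SRSHL / ADD / SMIN / SMAX / UZP1 chain is the usual fixed-point requantization of the int32 accumulators. Below is a minimal scalar sketch of one lane; it is an assumed reference model, not code from the patch, and the parameter names simply mirror the arm_gemm::Requantize32 fields and per-channel arrays named in the asm operand list (v10 appears to carry the left shift, v9 the multiplier, v8 the non-positive right shift, v11 the output offset, v14/v15 the clamp bounds).

#include <cstdint>

// Saturating rounding doubling multiply-high, as SQRDMULH computes per lane.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturating case
    return (int32_t)(((int64_t)a * b * 2 + (1LL << 31)) >> 32);
}

static uint8_t requantize_lane(int32_t acc, int32_t left_shift, int32_t mul,
                               int32_t right_shift,  // stored as <= 0 for SRSHL
                               int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = (int32_t)((uint32_t)acc << left_shift);   // SSHL
    v = sqrdmulh(v, mul);                                 // SQRDMULH
    // AND + SSHR #31 + SQADD: subtract one from negative values, but only
    // when a right shift will actually happen (right_shift < 0 has its sign
    // bit set), so the rounding shift below breaks ties away from zero.
    v += (v & right_shift) >> 31;   // SQADD saturates in the kernel; plain add here
    if (right_shift < 0)
    {
        // SRSHL by a negative amount: rounding arithmetic shift right.
        int n = -right_shift;
        v = (int32_t)(((int64_t)v + (1LL << (n - 1))) >> n);
    }
    v += c_offset;                                        // ADD
    v = v > maxval ? maxval : v;                          // SMIN
    v = v < minval ? minval : v;                          // SMAX
    return (uint8_t)v;                                    // UZP1 keeps the low byte
}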
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index a200ebf2cc..25d83f15c3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
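
The hunk above shows the pattern repeated across the SME2 headers and sources in this patch: the compile guard tightens from __aarch64__ to ARM_COMPUTE_ENABLE_SME2, and (in the .cpp files below) the standard includes move outside the guard. A minimal sketch of the resulting file shape, assuming nothing beyond what the hunks show:

#include <cstddef>
#include <cstdint>

#if defined(ARM_COMPUTE_ENABLE_SME2)

namespace arm_conv {
namespace depthwise {

// ... kernel declaration or implementation ...

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(ARM_COMPUTE_ENABLE_SME2)

With the includes outside the guard, a translation unit excluded from the build still preprocesses to something well formed rather than an entirely empty file.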
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 2ee961db15..96cfd5e497 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -151,7 +151,7 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x4, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x2\n"
- "ld1w { z18.s }, p3/Z, [x15]\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
"ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
"addvl x15, x15, #1\n"
@@ -159,13 +159,13 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mul x21, x21, x20\n" // offset *= output_tile_size
"cntw x23\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"addvl x15, x15, #4\n"
"add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"addvl x15, x15, #4\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x23, %x[n_channels]\n"
"add x22, x24, x22, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x15]\n"
@@ -179,71 +179,71 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
"whilelt p1.s, x23, %x[n_channels]\n"
"incw x21\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x14]\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x14]\n"
"incw x23\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
"incw x20\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
"addvl x7, x7, #1\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ld1w { z18.s }, p3/Z, [x15]\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p2/Z, [x8]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
"addvl x8, x8, #1\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
"addvl x17, x17, #1\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
"ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
"addvl x15, x15, #4\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"addvl x15, x15, #4\n"
"cmp x23, %x[n_channels]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
"addvl x14, x14, #1\n"
"ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
"ld1w { z10.s }, p1/Z, [x7]\n"
@@ -259,69 +259,69 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"addvl x15, x15, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
"ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x5, x5, #0x1\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x14]\n"
+ "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
+ "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
"cmp x5, x20\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
"add x20, x4, #0x1\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
"csel x4, x4, x20, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8]\n"
+ "fmla z26.s, p3/M, z4.s, z13.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x8]\n"
"csel x5, x5, XZR, LT\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
"cmp x4, x21\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x24]\n"
- "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "fmla z27.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "fmla z27.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z25.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
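
For orientation, a scalar reference model (assumed, not code from the patch) of what one channel lane of the 3x3 stride-1, 2x2-output kernel above computes: each of the four accumulators is seeded from the bias vector (the movprfx from z22), gathers the nine weight-by-input products over its 4x4 input window (the fmla chain), and is finally clamped to the activation range (the fclamp against the min/max registers). The real kernel does this for a whole SVE vector of channels at a time; only the arithmetic is shown here.

static void depthwise_3x3_s1_out2x2_ref(const float in[4][4],
                                        const float w[3][3],
                                        float bias, float act_min, float act_max,
                                        float out[2][2])
{
    for (int oi = 0; oi < 2; ++oi)
    {
        for (int oj = 0; oj < 2; ++oj)
        {
            float acc = bias;                                  // movprfx zN, z22
            for (int ki = 0; ki < 3; ++ki)
                for (int kj = 0; kj < 3; ++kj)
                    acc += w[ki][kj] * in[oi + ki][oj + kj];   // fmla
            out[oi][oj] = acc < act_min ? act_min              // fclamp
                        : acc > act_max ? act_max
                        : acc;
        }
    }
}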
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 079b39c5ec..39f1b3635f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -84,7 +84,7 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
"ldp x13, x12, [x20, #0x0]\n"
"cntw x11\n"
@@ -94,176 +94,176 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x28, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
"addvl x14, x14, #4\n"
"cmp x11, %x[n_channels]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x25, x22, [x15, #0x10]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x24, XZR, x11\n"
- "ldr x23, [x15, #0x20]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x11\n"
+ "ldr x20, [x15, #0x20]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
"whilelt p1.s, x11, %x[n_channels]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x20, [x15, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x27, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x22, [x15, #0x58]\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x15, #0x70]\n"
+ "fmla z31.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "ldr x26, [x15, #0x70]\n"
"addvl x14, x14, #1\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
+ "fmla z28.s, p3/M, z2.s, z25.s\n"
+ "fmla z29.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "incw x24\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldp x25, x22, [x15, #0x10]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
"incw x28\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x27, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
"whilelt p2.s, x28, %x[n_channels]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "ld1w { z10.s }, p1/Z, [x26, x11, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x25, x11, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x22, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
"incw x11\n"
"cmp x11, %x[n_channels]\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "incw x24\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ldr x21, [x15, #0x30]\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "incw x27\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x20, [x15, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x27, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x22, [x15, #0x58]\n"
+ "ldr x21, [x15, #0x58]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
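
A detail visible in both SME2 hunks above: SME2 operations such as fclamp and the multi-vector ld1w forms are emitted as raw .inst words with the intended mnemonic in a trailing comment, presumably so the sources still assemble with toolchains that predate SME2 support. Since the operand registers are baked into the encoded word, renaming the clamp bounds (z17/z16 to z22/z15 in this file) changes the encoding as well; both lines below are taken from the hunk above:

    .inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s
    .inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s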
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index cf74f431df..bd330dc21e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index ce0ae29756..d15a3a8377 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -170,11 +170,11 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x3\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
"ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
"mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
"addvl x17, x17, #1\n"
"add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -184,7 +184,7 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"add x24, x26, x22, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
"cmp x25, %x[n_channels]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
@@ -200,275 +200,275 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x25, %x[n_channels]\n"
"incw x21\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
"incw x25\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
"incw x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z8.s, z15.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
"addvl x6, x6, #1\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
"addvl x16, x16, #1\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
"addvl x5, x5, #1\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
"addvl x7, x7, #1\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z6.s, z18.s\n"
"ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
"addvl x14, x14, #1\n"
"cmp x25, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z26.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
"ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
"ld1w { z12.s }, p1/Z, [x14]\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x26]\n"
"ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
- "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
"addvl x26, x26, #1\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z26.s }, p0, [x24]\n"
- "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x21, x2, #0x1\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"cmp x3, x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
"csel x2, x2, x21, LT\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
"csel x3, x3, XZR, LT\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "cmp x2, x20\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
"fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z23.s }, p0, [x26]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z26.s }, p0, [x24]\n"
- "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z27.s\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z27.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "fmla z28.s, p3/M, z8.s, z27.s\n"
+ "fmla z29.s, p3/M, z7.s, z27.s\n"
+ "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z20.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z26.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index fd648a392f..2c868b6cf3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -87,354 +87,354 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "cntw x11\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "ldr x10, [x16, #0x20]\n"
- "mov x9, #0x0\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x20, [x17, #0x20]\n"
+ "mov x15, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x27, XZR, x11\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ld1w { z9.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "incw x27\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x16, #0x38]\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x25, [x17, #0x38]\n"
"mov p1.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x24, [x16, #0x28]\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x16, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x13, [x16, #0x50]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z23.s\n"
+ "ldr x24, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x12, [x16, #0x58]\n"
+ "ldr x23, [x17, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x16, #0x60]\n"
+ "ldr x22, [x17, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x15, [x16, #0x80]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x14, [x16, #0x88]\n"
- "addvl x17, x17, #1\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x10, [x16, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
- "ldr x10, [x16, #0x20]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "incw x9\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
- "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
- "ld1w { z13.s }, p0/Z, [x10, x11, LSL #2]\n"
- "incw x11\n"
- "cmp x11, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z23.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z23.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "fmla z30.s, p3/M, z0.s, z23.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x10, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x9, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ldr x28, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "ldr x27, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z24.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z23.s\n"
+ "fmla z26.s, p3/M, z1.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x22, x21, [x17, #0x0]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ldp x25, x24, [x17, #0x10]\n"
+ "incw x15\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "incw x16\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "incw x27\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x16, #0x38]\n"
- "mov p1.b, p2.b\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "mov p0.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x24, [x16, #0x28]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x16, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x13, [x16, #0x50]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x12, [x16, #0x58]\n"
+ "ldr x24, [x17, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x16, #0x60]\n"
+ "ldr x23, [x17, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x10, [x16, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x10, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x9, [x17, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "ldr x27, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x26, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ldr x24, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ldr x21, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 9184cc00e4..add666e14e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 5380567d36..efd37c38ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -193,18 +193,18 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x4\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
"ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
"mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
"add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"addvl x17, x17, #1\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"add x27, x28, x22, LSL #2\n"
"cntw x26\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
"add x25, x27, x22, LSL #2\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
@@ -224,440 +224,440 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #1\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x26, %x[n_channels]\n"
"incw x21\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
"incw x26\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
"incw x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x11, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x4, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
"ld1w { z9.s }, p2/Z, [x6]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
"ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
"addvl x5, x5, #1\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z20.s, p3/M, z1.s, z11.s\n"
+ "fmla z21.s, p3/M, z0.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
"ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
"addvl x7, x7, #1\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x16]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z2.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
"addvl x16, x16, #1\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z22.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z5.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
"addvl x12, x12, #1\n"
"ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z19.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
"ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
"addvl x6, x6, #1\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
"cmp x26, %x[n_channels]\n"
"addvl x14, x14, #1\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z19.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x28]\n"
- "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
- "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
"addvl x28, x28, #1\n"
- "st1w { z20.s }, p0, [x27]\n"
- "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
- "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
- "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z24.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
"addvl x27, x27, #1\n"
- "st1w { z24.s }, p0, [x25]\n"
- "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
"addvl x25, x25, #1\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x21, x2, #0x1\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"cmp x3, x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x2, x2, x21, LT\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
"csel x3, x3, XZR, LT\n"
"fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
"cmp x2, x20\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x4, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
"fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
"ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x6]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z3.s, z12.s\n"
"fmla z22.s, p3/M, z1.s, z12.s\n"
"fmla z23.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
"fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
"ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
"fmla z22.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z14.s\n"
+ "fmla z21.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z1.s, z14.s\n"
+ "fmla z17.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
"fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z14.s\n"
+ "fmla z23.s, p3/M, z7.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "fmla z19.s, p3/M, z1.s, z14.s\n"
"ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z26.s, p3/M, z1.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z8.s, z14.s\n"
"ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z14.s\n"
+ "fmla z19.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x28]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
- "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
"st1w { z20.s }, p0, [x27]\n"
"st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
"st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
"st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
- "st1w { z24.s }, p0, [x25]\n"
- "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x23]\n"
+ "st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index d904f68806..2e2a45bab0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
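
[Editor's note on the hunk above, which this patch repeats for each kernel source: it hoists the standard-library includes out of the ARM_COMPUTE_ENABLE_SME2 guard, so <cstddef> and <cstdint> are now included unconditionally and only the SME2-specific body is compiled out when the flag is absent. The resulting layout, reconstructed from the hunk rather than quoted from the full file, is roughly:

    // Sketch assembled from the hunk above: the includes are
    // unconditional, the kernel body is not.
    #include <cstddef>
    #include <cstdint>

    #if defined(ARM_COMPUTE_ENABLE_SME2)
    namespace arm_conv {
    namespace depthwise {
    // ... SME2 kernel implementation ...
    } // namespace depthwise
    } // namespace arm_conv
    #endif // defined(ARM_COMPUTE_ENABLE_SME2)
]
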
@@ -98,552 +98,552 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "cntw x11\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "mov x10, #0x0\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mov x15, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x28, XZR, x11\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ld1w { z9.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "incw x28\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
"mov p1.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x15, [x16, #0x40]\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x23, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x14, [x16, #0x48]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x13, [x16, #0x50]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
+ "ldr x26, [x17, #0x60]\n"
"fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
"fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x16, #0x58]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "ldr x23, [x9, #0x0]\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x22, [x9, #0x8]\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x9, #0x10]\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
"fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0xc8]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xc8]\n"
"fmla z17.s, p3/M, z5.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
"fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x15, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x12, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "fmla z17.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z19.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ldr x21, [x17, #0x118]\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z8.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldp x15, x14, [x16, #0x0]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x20, x25, [x17, #0x0]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
"fmla z17.s, p3/M, z3.s, z10.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
"fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldp x13, x12, [x16, #0x10]\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x20]\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x20, x24, [x17, #0x10]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "incw x15\n"
+ "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
"fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x28]\n"
+ "st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
"fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x38]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x40]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x48]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- "incw x10\n"
- "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x50]\n"
- "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
- "whilelt p2.s, x10, %x[n_channels]\n"
- "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x58]\n"
- "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x60]\n"
- "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
- "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x68]\n"
- "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
- "incw x11\n"
- "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x70]\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x78]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "cmp x11, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z22.s, p3/M, z5.s, z0.s\n"
+ "fmla z23.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "incw x16\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "incw x28\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "mov p1.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x15, [x16, #0x40]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x14, [x16, #0x48]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x13, [x16, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x16, #0x58]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x15, [x16, #0x80]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x14, [x16, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "ldr x23, [x9, #0x0]\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x22, [x9, #0x8]\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x9, #0x10]\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "ldr x15, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "ldr x12, [x16, #0xd8]\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z23.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
"fmla z28.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
"fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
"fmla z24.s, p3/M, z4.s, z11.s\n"
"fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x15, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
- "ldr x14, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
- "ldr x13, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x12, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z20.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z16.s, p3/M, z0.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- ".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x20]\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x28]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- ".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x38]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x40]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x48]\n"
- ".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- ".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x50]\n"
- "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x58]\n"
- "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
- "ldr x23, [x9, #0x60]\n"
- "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
- "ldr x22, [x9, #0x68]\n"
- "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
- "ldr x21, [x9, #0x70]\n"
- "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
- "ldr x20, [x9, #0x78]\n"
- "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
- "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z18.s, p3/M, z5.s, z0.s\n"
+ "fmla z19.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z19.s }, p0, [x20, x13, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
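Note on the clobber-list change directly above: the regenerated kernel body evidently uses x8 as a scratch register (it is added to the clobbers), and any register the inline assembly writes must be declared clobbered or the compiler is free to keep a live value there across the asm block. A minimal AArch64-only sketch of the pattern, with hypothetical operands unrelated to the kernel's actual interface:

    #include <cstdint>

    // Minimal inline-asm sketch: x8 is used as scratch inside the block,
    // so it is listed as clobbered alongside the condition flags.
    inline uint64_t scaled_sum_sketch(uint64_t a, uint64_t b)
    {
        uint64_t result;
        __asm__ __volatile__(
            "add x8, %x[a], %x[b]\n"   // writes x8...
            "lsl %x[res], x8, #1\n"
            : [res] "=r" (result)
            : [a] "r" (a), [b] "r" (b)
            : "cc", "x8"               // ...so x8 must appear here
        );
        return result;
    }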
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index a4ca907e1b..dcffffeb21 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
namespace arm_conv {
namespace depthwise {
@@ -68,4 +68,4 @@ class sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
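The guard rewrite above follows the pattern applied across these kernels: the header is now gated on ARM_COMPUTE_ENABLE_SME2 rather than a bare __aarch64__ check, so SME2 strategies are only declared when the build actually enables SME2, and the utils include is resolved through the library's include paths instead of a repository-rooted path. The companion .cpp diffs below additionally hoist the standard includes above the guard, so the translation unit still sees them when SME2 is disabled. A sketch of the resulting shape (illustrative, not the literal file contents):

    #include <cstddef>
    #include <cstdint>

    #if defined(ARM_COMPUTE_ENABLE_SME2)

    namespace arm_conv {
    namespace depthwise {
    // ... kernel declaration/implementation, compiled only for
    //     SME2-enabled builds ...
    }  // namespace depthwise
    }  // namespace arm_conv

    #endif  // defined(ARM_COMPUTE_ENABLE_SME2)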
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index f7f67855c1..066b935486 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -170,7 +170,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mul x21, x2, x22\n" // offset = tile_i * ld_output_row
"mov x20, #0x2\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
"ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
"addvl x17, x17, #1\n"
@@ -178,13 +178,13 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"mul x21, x21, x20\n" // offset *= output_tile_size
"cntw x23\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"addvl x17, x17, #4\n"
"add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"addvl x17, x17, #4\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x23, %x[n_channels]\n"
"add x22, x24, x22, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
@@ -201,73 +201,73 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
"whilelt p1.s, x23, %x[n_channels]\n"
"incw x21\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
"incw x23\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
"mov p0.b, p2.b\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
"addvl x5, x5, #1\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z27.s\n"
+ "ld1w { z25.s }, p2/Z, [x16]\n"
"addvl x6, x6, #1\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x7]\n"
"incw x20\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
"addvl x7, x7, #1\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z22.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
"addvl x16, x16, #1\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
"cmp x23, %x[n_channels]\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
"addvl x14, x14, #1\n"
@@ -291,71 +291,71 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"addvl x17, x17, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"add x3, x3, #0x1\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
"cmp x3, x20\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
"add x20, x2, #0x1\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
"csel x2, x2, x20, LT\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
"csel x3, x3, XZR, LT\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
"cmp x2, x21\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
"st1w { z28.s }, p0, [x24]\n"
"st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
"st1w { z30.s }, p0, [x22]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index e2ff9a214e..dc7a40ff54 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
#include <cstddef>
#include <cstdint>
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
namespace arm_conv {
namespace depthwise {
@@ -93,7 +93,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z19.s }, p3/Z, [x15]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"ldp x14, x13, [x20, #0x0]\n"
"cntw x12\n"
@@ -103,119 +103,119 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- "ldp x28, x27, [x16, #0x0]\n"
+ "ldp x28, x26, [x16, #0x0]\n"
"addvl x15, x15, #4\n"
"cmp x12, %x[n_channels]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x24, XZR, x12\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x12\n"
"ldp x23, x22, [x16, #0x20]\n"
"ld1w { z8.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"ldp x21, x20, [x16, #0x30]\n"
"ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
"ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
"ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
"ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
"ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x28, [x16, #0x40]\n"
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
"whilelt p1.s, x12, %x[n_channels]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x16, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldr x26, [x16, #0x50]\n"
+ "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x16, #0x58]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x16, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x28, [x16, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
"ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x27, [x16, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x21, [x16, #0x70]\n"
- "ldr x25, [x16, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z4.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x21, [x16, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x28, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldp x28, x27, [x16, #0x0]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "ld1w { z19.s }, p3/Z, [x15]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x20, x26, [x16, #0x0]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
"addvl x15, x15, #1\n"
"incw x9\n"
"ldp x23, x22, [x16, #0x20]\n"
- "ld1w { z9.s }, p1/Z, [x28, x12, LSL #2]\n"
- "incw x24\n"
+ "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x27\n"
"mov p0.b, p2.b\n"
"ldp x21, x20, [x16, #0x30]\n"
- "ld1w { z10.s }, p1/Z, [x27, x12, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
"whilelt p2.s, x9, %x[n_channels]\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "ld1w { z11.s }, p1/Z, [x26, x12, LSL #2]\n"
- "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x25, x12, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
"ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
"ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
"ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
"incw x12\n"
@@ -228,83 +228,83 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"addvl x15, x15, #1\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x28, [x16, #0x40]\n"
- "incw x24\n"
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "incw x27\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x16, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ldr x26, [x16, #0x50]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x16, #0x58]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x16, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x28, [x16, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x27, [x16, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x21, [x16, #0x70]\n"
- "ldr x25, [x16, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x21, [x16, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x28, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
"mov p0.b, p2.b\n"
- ".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
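The indirect variant above reads its inputs through a pointer table rather than strided addresses: each "ldr x20, [x16, #0x48]" fetches a row pointer from the inptrs array held in the Args structure, and the following "ld1w { ... }, p2/Z, [x20, x9, LSL #2]" loads the current channel block through it. The renames in this hunk only reshuffle which scratch registers hold those pointers. A sketch of the addressing pattern in C++ (names illustrative, not the library's actual types):

    #include <cstddef>

    // One "table fetch + channel load" step of the indirect kernel:
    // pick a row pointer out of the table, then index by channel offset.
    float load_input_point(const float *const *inptrs,
                           std::size_t table_index,
                           std::size_t channel_offset)
    {
        const float *row = inptrs[table_index]; // ldr xN, [x16, #off]
        return row[channel_offset];             // ld1w { z }, p/Z, [xN, x9, LSL #2]
    }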
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
index d29d0b5496..061b0a1e2e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
index 4d02d29e4e..a385893146 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -69,69 +69,69 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z5.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z16.s, #0x0\n"
+ "fmov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
"orr x24, x20, %x[ld_in_col], LSL #18\n"
- "mov z17.d, z16.d\n"
+ "mov z21.d, z20.d\n"
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
- "orr x24, x17, x24, LSL #20\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
"mov x22, #0x6\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z3.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
- "add x21, x7, x6\n"
- ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z18.d, z16.d\n"
- "mov z19.d, z16.d\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
"ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
"lsl x24, x24, #0x2\n"
"sub x22, x22, x21\n"
"ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
"mov x10, #0x2\n"
- "ldp x9, x28, [x11], #0x10\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"ldp x27, x26, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x25, x24, [x11], #0x10\n"
+ "ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -140,8 +140,8 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z12.s }, p1, [x9]\n"
@@ -154,298 +154,298 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x10, 8f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- ".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
+ ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
"8:" // Unpadded: 0 priming loads
- "cbz x15, 16f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "sub x15, x15, #0x1\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x15, x13\n"
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x1\n"
"ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x15, x13, LT\n"
+ "sub x11, x11, #0x1\n"
"ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "cmp x14, x11\n"
"ld1w { z28.s }, p1/Z, [x20]\n"
- "sub x13, x13, x21\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
"st1w { z12.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
"st1w { z13.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z14.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x10, 13f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
"13:" // Padded: 0 priming loads
- "cbz x15, 16f\n"
+ "cbz x14, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "sub x15, x15, #0x1\n"
- "sub x13, x13, #0x1\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "cmp x15, x13\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "cmp x14, x11\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "csel x21, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p0/Z, [x14]\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z18.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x8, x8, #0x1\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x25, x25, x23, LSL #2\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- ".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
- ".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
- ".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
- ".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
"add x8, x8, #0x1\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z18.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"16:" // Main loop skip tail
- "cbz x13, 18f\n"
+ "cbz x11, 18f\n"
"17:" // Right padding loop
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x9]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
+ "st1w { z8.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "st1w { z13.s }, p1, [x28]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "st1w { z9.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z10.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z11.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
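At label "18:" above, the planar kernel updates its persistent state for the next channel block: the weights pointer advances by nine vectors of data, inptr by ld_in_vl elements, and each output pointer by its own vector-length stride; the rewrite only changes which scratch registers carry these values (and the clobber list drops x6 accordingly). A hedged C++ sketch of that bookkeeping, with invented field names standing in for the Args members:

    #include <cstddef>

    // Illustrative stand-in for the kernel's Args state; field names are
    // assumptions, not the library's actual declarations.
    struct ArgsSketch
    {
        const float  *weights;
        const float  *inptr;
        float       **outptrs;     // four output row pointers
        const long   *ld_out_vls;  // per-row output strides, in elements
        long          ld_in_vl;    // input stride to the next channel block
    };

    void advance_channel_state(ArgsSketch &args, std::size_t vl /* elems per vector */)
    {
        args.weights += 9 * vl;        // "incb xN, ALL, MUL #9" counts bytes in asm
        args.inptr   += args.ld_in_vl; // "add x20, x20, x21, LSL #2"
        for (int i = 0; i < 4; ++i)
        {
            args.outptrs[i] += args.ld_out_vls[i]; // bump each output pointer
        }
    }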
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
index 18a572954a..711f7f479a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
index 9f6b09ef88..26315101b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -69,69 +69,69 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z19.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z24.s, #0x0\n"
+ "fmov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z24.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
"orr x24, x20, %x[ld_in_col], LSL #18\n"
- "mov z25.d, z24.d\n"
+ "mov z13.d, z12.d\n"
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa0404ae2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x23]\n"
- "orr x24, x17, x24, LSL #20\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
"mov x22, #0x9\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z7.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
- "add x21, x7, x6\n"
- ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "mov z14.d, z12.d\n"
+ "mov z15.d, z12.d\n"
+ "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
"addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ae1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x23]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
"lsl x24, x24, #0x2\n"
"sub x22, x22, x21\n"
- "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x7, x14\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x2\n"
- "ldp x10, x9, [x11], #0x10\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x28, x27, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x11], #0x10\n"
+ "ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -142,9 +142,9 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
"lsr x21, x21, #0x1\n"
- "sub x13, x13, x21\n"
+ "sub x11, x11, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z20.s }, p1, [x10]\n"
@@ -157,490 +157,490 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x22, 8f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
"8:" // Unpadded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "sub x15, x15, #0x2\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "lsr x20, x15, #0x1\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "cmp x20, x13\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "cmp x20, x11\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "csel x22, x20, x13, LT\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "and x15, x15, #0x1\n"
- "ld1w { z0.s }, p1/Z, [x21]\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x22\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
"subs x22, x22, #0x1\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
"st1w { z20.s }, p1, [x10]\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z21.s }, p1, [x9]\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
"add x9, x9, x27, LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x21]\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
"st1w { z22.s }, p1, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
"add x26, x26, x24, LSL #2\n"
"st1w { z23.s }, p1, [x25]\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x22, 13f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
"13:" // Padded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x15, x15, #0x2\n"
- "ld1w { z31.s }, p0/Z, [x21]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
- "sub x13, x13, #0x1\n"
- "lsr x20, x15, #0x1\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "cmp x20, x13\n"
+ "cmp x20, x11\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "csel x22, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x22\n"
"cbz x22, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "st1w { z28.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "st1w { z30.s }, p1, [x26]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z31.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
+ ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
"mov x12, #0x0\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
"add x25, x25, x23, LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "st1w { z28.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
"mov x12, #0x8\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z31.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
"16:" // Main loop skip tail
- "cbz x15, 17f\n" // Skip remainder inputs
+ "cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x10]\n"
+ ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
+ ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x9]\n"
+ "st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z23.s }, p1, [x25]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x13, 19f\n"
+ "cbz x11, 19f\n"
"18:" // Right padding loop
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x10]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z21.s }, p1, [x9]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z22.s }, p1, [x26]\n"
+ "st1w { z2.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
+ "st1w { z3.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 18b\n"
"19:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
index 0fa0300f9f..71487e08b6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
index bf12b42ddc..3741b973b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -69,71 +69,71 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x5\n"
- "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x7\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x6\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x16, #0x1\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
"mov z29.d, z28.d\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "orr x23, x7, x23, LSL #20\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "orr x23, x17, x23, LSL #20\n"
"mov x22, #0x8\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x21, x6, x5\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "add x21, x7, x6\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x6, x14\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "madd x20, x20, x7, x13\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x6, x20, x14\n"
+ "msub x13, x7, x20, x13\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x10, #0x4\n"
- "ldp x9, x28, [x11], #0x10\n"
+ "ldp x9, x28, [x22], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x27, x26, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x25, x24, [x11], #0x10\n"
+ "ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -141,308 +141,308 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z25.s }, p1, [x28]\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x6, x5\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
"cbz x10, 10f\n"
"cmp x10, #0x1\n"
- "sub x16, x16, x10\n"
+ "sub x15, x15, x10\n"
"beq 9f\n"
"cmp x10, #0x2\n"
"beq 8f\n"
"cmp x10, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "7:" // Unpadded: 3 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
- ".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
+ ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
+ ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"9:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
+ ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cbz x15, 20f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "sub x15, x15, #0x1\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ "sub x11, x11, #0x1\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
+ "cmp x15, x11\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- ".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
- ".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "10:" // Unpadded: 0 priming loads
- "cbz x16, 20f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "sub x16, x16, #0x1\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x16, x13\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x16, x13, LT\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x21\n"
+ ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
- "cbz x21, 19f\n"
- "11:" // Unpadded: Main loop
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p1/Z, [x14]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
"cbz x10, 17f\n"
"cmp x10, #0x1\n"
- "sub x16, x16, x10\n"
+ "sub x15, x15, x10\n"
"beq 16f\n"
"cmp x10, #0x2\n"
"beq 15f\n"
@@ -451,429 +451,429 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
+ "ld1w { z6.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
+ ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
+ ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
- ".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
- ".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
"17:" // Padded: 0 priming loads
- "cbz x16, 20f\n"
+ "cbz x15, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x16, x16, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "sub x13, x13, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "cmp x16, x13\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "cmp x15, x11\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "csel x21, x16, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p0/Z, [x14]\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- ".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
- ".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
- ".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- ".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
- ".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
- ".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- ".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
- ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x28]\n"
+ ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- "st1w { z26.s }, p1, [x25]\n"
+ ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
+ ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"20:" // Main loop skip tail
- "cbz x13, 22f\n"
+ "cbz x11, 22f\n"
"21:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ "st1w { z0.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x28]\n"
+ "st1w { z1.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z2.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z3.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 21b\n"
"22:" // End
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x15, ALL, MUL #16\n"
- "incb x15, ALL, MUL #9\n"
- "str x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
index cae4b24e66..7412c7b57c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
index 755265835d..81ad8e5833 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -76,11 +76,11 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z0.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x6\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
@@ -99,64 +99,64 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"orr x23, x7, x23, LSL #20\n"
"mov x22, #0xb\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"mov x8, #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"sub x22, x22, x21\n"
"madd x20, x20, x6, x14\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
"msub x14, x6, x20, x14\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x4\n"
- "ldp x10, x9, [x11], #0x10\n"
+ "ldp x11, x10, [x23], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x28, x27, [x20], #0x10\n"
+ "ldp x9, x28, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x26, x25, [x11], #0x10\n"
- "ldp x24, x23, [x20], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
+ ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
"lsr x21, x21, #0x1\n"
"sub x13, x13, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ "st1w { z17.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z19.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
"adds XZR, x6, x5\n"
@@ -171,331 +171,331 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
"ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
"7:" // Unpadded: 3 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"8:" // Unpadded: 2 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z19.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"9:" // Unpadded: 1 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z7.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
+ ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
+ ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"10:" // Unpadded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"sub x16, x16, #0x2\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x13, x13, #0x1\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"lsr x20, x16, #0x1\n"
- "ld1w { z22.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x13\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "csel x22, x20, x13, LT\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"and x16, x16, #0x1\n"
- "ld1w { z24.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, x22\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- "cbz x22, 19f\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "cbz x23, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ld1w { z15.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
- "ld1w { z14.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ "st1w { z9.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z26.s }, p1, [x26]\n"
+ ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
+ ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z11.s }, p1/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -510,654 +510,654 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
+ ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z19.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"17:" // Padded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x21]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
"sub x16, x16, #0x2\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"sub x13, x13, #0x1\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
"lsr x20, x16, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"cmp x20, x13\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "csel x22, x20, x13, LT\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
"and x16, x16, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 19f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z24.s }, p1, [x11]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z25.s }, p1, [x9]\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "ld1w { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x26, x26, x24, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x10]\n"
+ "st1w { z8.s }, p1, [x11]\n"
"mov x12, #0x8\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "add x10, x10, x28, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "st1w { z25.s }, p1, [x9]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
+ ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x26, x26, x24, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x25]\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"20:" // Main loop skip tail
"cbz x16, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
"sub x13, x13, #0x1\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
"ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
+ "st1w { z25.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
+ ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
"add x8, x8, #0x1\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"21:" // Tail input: End
"cbz x13, 23f\n"
"22:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z11.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z27.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
"bgt 22b\n"
"23:" // End
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x15, ALL, MUL #16\n"
- "incb x15, ALL, MUL #9\n"
- "str x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x17\n"
"whilelt p1.s, x17, x7\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
index f09c61667f..50ef6c3815 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
index 5570b27644..be82e04613 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -69,102 +69,102 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z22.s, #0x0\n"
+ "fmov z26.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z22.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "fmov z9.s, #0x0\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "incb x20\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- ".inst 0x648aab29 // bfcvtnt z9.h, p2/M, z25.s\n"
- "incb x21, ALL, MUL #3\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aab28 // bfcvt z8.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
- "fmov z2.s, #0x0\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aab21 // bfcvt z1.h, p2/M, z25.s\n"
- ".inst 0x648aab68 // bfcvtnt z8.h, p2/M, z27.s\n"
- "incb x20\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x648aaaa6 // bfcvtnt z6.h, p2/M, z21.s\n"
- ".inst 0x658aaaa5 // bfcvt z5.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aab22 // bfcvtnt z2.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "fmov z6.s, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z30.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
+ "fmov z11.s, #0x0\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
+ ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
+ ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z20.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
- ".inst 0x648aab61 // bfcvtnt z1.h, p2/M, z27.s\n"
- ".inst 0x658aab6c // bfcvt z12.h, p2/M, z27.s\n"
- "ld1w { z27.s }, p2/Z, [x21]\n"
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
+ ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
"incb x21, ALL, MUL #3\n"
- "fmov z7.s, #0x0\n"
- ".inst 0x658aab24 // bfcvt z4.h, p2/M, z25.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
- ".inst 0x648aaaac // bfcvtnt z12.h, p2/M, z21.s\n"
- "sub x20, x15, #0x1\n"
+ "fmov z14.s, #0x0\n"
+ ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
+ ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
+ "sub x20, x14, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- ".inst 0x658aaaaa // bfcvt z10.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x21]\n"
- "orr x23, x17, x23, LSL #20\n"
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ "ld1w { z29.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
"mov x22, #0x6\n"
- "add x21, x7, x6\n"
+ "add x21, x17, x7\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z23.d, z22.d\n"
- ".inst 0x648aab27 // bfcvtnt z7.h, p2/M, z25.s\n"
- ".inst 0x648aab64 // bfcvtnt z4.h, p2/M, z27.s\n"
- ".inst 0x648aaaa0 // bfcvtnt z0.h, p2/M, z21.s\n"
+ "mov z27.d, z26.d\n"
+ ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
+ ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
+ ".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0x658aaaa3 // bfcvt z3.h, p2/M, z21.s\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040ac0 // mova za.d[x8, #0], { z22.d-z23.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040ac1 // mova za.d[x8, #1], { z22.d-z23.d }\n"
+ ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
"mov x10, #0x2\n"
- "ldp x9, x28, [x11], #0x10\n"
- ".inst 0xc0040ac2 // mova za.d[x8, #2], { z22.d-z23.d }\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0040ac3 // mova za.d[x8, #3], { z22.d-z23.d }\n"
+ ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "ldp x25, x24, [x11], #0x10\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -172,389 +172,389 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "sub x13, x13, x21\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
+ ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z20.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z22.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z25.s }, p1, [x25]\n"
+ "st1w { z21.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z23.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x10, 8f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
+ ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
+ ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
+ ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc12a11d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
+ ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
+ ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
+ ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
+ ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
+ ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
+ ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
"8:" // Unpadded: 0 priming loads
- "cbz x15, 16f\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "sub x15, x15, #0x1\n"
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "cmp x15, x13\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
+ "cmp x14, x11\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x15, x13, LT\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x13, x13, x21\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x13]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
"ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
"ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x10, 13f\n"
"cmp x10, #0x1\n"
- "sub x15, x15, x10\n"
+ "sub x14, x14, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
+ ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
- ".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc12a11d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
+ ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
+ ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
"13:" // Padded: 0 priming loads
- "cbz x15, 16f\n"
+ "cbz x14, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x15, x15, #0x1\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x13, x13, #0x1\n"
- "cmp x15, x13\n"
- "csel x21, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "sub x13, x13, x21\n"
+ "sub x14, x14, #0x1\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
"ld1w { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
- ".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
- ".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
- ".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
- ".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- "st1w { z26.s }, p1, [x28]\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ "st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ "st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ "st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"16:" // Main loop skip tail
- "cbz x13, 18f\n"
+ "cbz x11, 18f\n"
"17:" // Right padding loop
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x9]\n"
+ ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z26.s }, p1, [x28]\n"
+ "st1w { z30.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x25]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z29.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x24]\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
\ No newline at end of file
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
index 89b9199084..e685884762 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
index e8c9bfeb29..a3b9ca402a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -69,89 +69,89 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z27.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
+ "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z23.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z4.s, #0x0\n"
+ "fmov z24.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z4.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "incb x20\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aaa69 // bfcvt z9.h, p2/M, z19.s\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aab09 // bfcvtnt z9.h, p2/M, z24.s\n"
- "incb x20\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z23.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
+ "incb x21\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
+ ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z9.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
- ".inst 0x658aa983 // bfcvt z3.h, p2/M, z12.s\n"
- ".inst 0x658aaa62 // bfcvt z2.h, p2/M, z19.s\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- ".inst 0x648aab02 // bfcvtnt z2.h, p2/M, z24.s\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa980 // bfcvt z0.h, p2/M, z12.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z19.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aaa6a // bfcvt z10.h, p2/M, z19.s\n"
- "sub x20, x15, #0x1\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #3\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- "mov z5.d, z4.d\n"
- "ld1w { z12.s }, p2/Z, [x21]\n"
- "orr x23, x17, x23, LSL #20\n"
+ "mov z25.d, z24.d\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
"mov x22, #0x9\n"
- "mov z6.d, z4.d\n"
- "add x21, x7, x6\n"
+ "mov z26.d, z24.d\n"
+ "add x21, x17, x7\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "mov z7.d, z4.d\n"
- ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
- ".inst 0x658aa981 // bfcvt z1.h, p2/M, z12.s\n"
+ "mov z27.d, z24.d\n"
+ ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
"mov x8, #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "madd x20, x20, x17, x13\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
"mov x22, #0x2\n"
- "ldp x10, x9, [x11], #0x10\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ldp x28, x27, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x11], #0x10\n"
+ "ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -159,396 +159,396 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
"lsr x21, x21, #0x1\n"
- "sub x13, x13, x21\n"
+ "sub x11, x11, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z29.s }, p1, [x9]\n"
+ "st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x17, x7\n"
"bne 10f\n"
"cbz x22, 8f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaace // bfcvt z14.h, p2/M, z22.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
+ ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
+ ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
"7:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
+ ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
"8:" // Unpadded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "sub x15, x15, #0x2\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "sub x13, x13, #0x1\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "lsr x20, x15, #0x1\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "lsr x20, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "cmp x20, x13\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "cmp x20, x11\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "csel x22, x20, x13, LT\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "csel x22, x20, x11, LT\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "and x15, x15, #0x1\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "and x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "sub x13, x13, x22\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "sub x11, x11, x22\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- "add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
"ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
+ ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x14]\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
"subs x22, x22, #0x1\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
"add x10, x10, x28, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "st1w { z29.s }, p1, [x9]\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z18.s }, p1, [x26]\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
"add x26, x26, x24, LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
+ ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
"cbz x22, 13f\n"
"cmp x22, #0x1\n"
- "sub x15, x15, x22\n"
+ "sub x14, x14, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
+ ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
"13:" // Padded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "sub x15, x15, #0x2\n"
- "sub x13, x13, #0x1\n"
- "lsr x20, x15, #0x1\n"
- "cmp x20, x13\n"
- "csel x21, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "and x15, x15, #0x1\n"
- "sub x13, x13, x21\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x21, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x21\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
"mov x12, #0x0\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
@@ -563,197 +563,197 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"st1w { z30.s }, p1, [x26]\n"
"add x8, x8, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2c // bfcvt z12.h, p2/M, z17.s\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
"ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa6d // bfcvt z13.h, p2/M, z19.s\n"
+ ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"subs x21, x21, #0x1\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z31.s }, p1, [x25]\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"add x9, x9, x27, LSL #2\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0x648aaaab // bfcvtnt z11.h, p2/M, z21.s\n"
- ".inst 0x648aaa8c // bfcvtnt z12.h, p2/M, z20.s\n"
+ ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
+ ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"bgt 14b\n"
"15:" // Main loop tail
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x14]\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
"st1w { z28.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z29.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"st1w { z30.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
"st1w { z31.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"16:" // Main loop skip tail
- "cbz x15, 17f\n" // Skip remainder inputs
+ "cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
+ ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1w { z29.s }, p1, [x9]\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z10.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z11.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x13, 19f\n"
+ "cbz x11, 19f\n"
"18:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x13, x13, #0x1\n"
- ".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x10]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z29.s }, p1, [x9]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
+ "st1w { z10.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z11.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 18b\n"
"19:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20, LSL #2\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
"ldp x21, x20, [x24, #0x0]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x0]\n"
- "ldp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "stp x23, x22, [x11, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
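
The hunk above is a mechanical re-roll of the generated kernel: the generator reassigned general-purpose scratch registers (for example x14 -> x13 for the input pointer) and z-registers, updating each hard-coded .inst encoding to match, while the dataflow is unchanged. Pairs of input rows are loaded as fp32 and narrowed to bf16 with bfcvt/bfcvtnt (which fill the even and odd half-word lanes respectively), accumulated into the ZA array with bfdot against the bf16-converted weights, then clamped with fclamp and stored. A minimal scalar sketch of that per-output pipeline follows; this is hypothetical reference code, not part of the patch, and the round-to-nearest behaviour of the real bfcvt is simplified to truncation here.

#include <algorithm>
#include <cstdint>
#include <cstring>

// Reference model only: narrow an fp32 value through bf16, as bfcvt does
// (the instruction rounds to nearest even; truncation shown for brevity).
static inline float through_bf16(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xffff0000u;  // keep the bf16 sign/exponent/top-mantissa bits
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

// One output element of the 3x3 planar depthwise convolution: products of
// bf16-narrowed inputs and weights accumulated in fp32 (the bfdot pattern),
// then clamped (fclamp). For stride 2 the caller advances `in` by
// 2 * ld_col between output columns.
static float conv3x3_point(const float *in, long ld_row, long ld_col,
                           const float weights[9], float bias,
                           float clamp_min, float clamp_max)
{
    float acc = bias;
    for (int r = 0; r < 3; r++)
        for (int c = 0; c < 3; c++)
            acc += through_bf16(in[r * ld_row + c * ld_col]) *
                   through_bf16(weights[r * 3 + c]);
    return std::min(std::max(acc, clamp_min), clamp_max);
}
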
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
index c2d439fe78..5215ccaf39 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
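
Alongside the register re-rolls, the commit touches the strategy headers: this hunk adds a direct include of depthwise_planar.hpp so the header no longer relies on its includer and parses standalone. The commit title also mentions guard directives; a typical guarded header in this directory looks roughly like the sketch below, an assumption based on the ARM_COMPUTE_ENABLE_SME2 guards used elsewhere in the library rather than something shown in this hunk.

// Illustrative skeleton of a guarded SME2 planar strategy header;
// assumes the ComputeLibrary source tree is on the include path.
#if defined(ARM_COMPUTE_ENABLE_SME2)

#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"

namespace arm_conv {
namespace depthwise {

// ... strategy class built on the planar-strategy machinery declared
// in depthwise_planar.hpp ...

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
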
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
index 2b3a247686..b72042558d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -73,237 +73,237 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
"sub x20, x20, x4\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z26.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z31.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
- "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z24.s, #0x0\n"
- "cbz x20, 2f\n"
- "ld1w { z24.s }, p1/Z, [x20, x7, LSL #2]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z30.s, #0x0\n"
+ "cbz x21, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "incb x20\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "incb x21\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
+ ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"addvl x24, SP, #30\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
+ ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
"addvl x24, x24, #-6\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x20\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "mov x21, x20\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x20\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
+ ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
+ "incb x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
+ ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ "mov x23, x21\n"
+ "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
+ ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
+ "ld1w { z4.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "incb x21\n"
+ "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- "fmov z4.s, #0x0\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ "fmov z2.s, #0x0\n"
+ ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
+ "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
+ ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
+ ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24]\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
+ "fmov z21.s, #0x0\n"
+ "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
+ "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x20\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
+ ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
+ ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
"ld1w { z3.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z2.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
+ "ld1w { z26.s }, p2/Z, [x21]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
"incb x21, ALL, MUL #5\n"
- "fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "sub x20, x17, #0x1\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
+ "fmov z8.s, #0x0\n"
+ "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "sub x20, x25, #0x1\n"
+ "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "orr x23, x6, x23, LSL #20\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
+ "orr x23, x7, x23, LSL #20\n"
"mov x22, #0x8\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "add x21, x5, x4\n"
+ "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ "add x21, x6, x4\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
"addvl x24, x24, #-6\n"
- "mov z25.d, z24.d\n"
- ".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- ".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
+ "mov z31.d, z30.d\n"
+ ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
+ "st1h { z8.h }, p2, [x24]\n"
+ ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
+ ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
"mov x11, #0x0\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
"mov x8, #0x8\n"
- "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
- "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
"sub x22, x22, x21\n"
- "madd x20, x20, x5, x16\n"
- "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "madd x20, x20, x6, x16\n"
+ "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- "msub x16, x5, x20, x16\n"
- ".inst 0xc0046b00 // mova za.d[x11, #0], { z24.d-z25.d }\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046b01 // mova za.d[x11, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc0046b02 // mova za.d[x11, #2], { z24.d-z25.d }\n"
- "ldp x0, x10, [x20], #0x10\n"
- ".inst 0xc0046b03 // mova za.d[x11, #3], { z24.d-z25.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
+ "ldp x5, x10, [x20], #0x10\n"
+ ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046b04 // mova za.d[x11, #4], { z24.d-z25.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc0046b05 // mova za.d[x11, #5], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046b06 // mova za.d[x11, #6], { z24.d-z25.d }\n"
- ".inst 0xc0046b07 // mova za.d[x11, #7], { z24.d-z25.d }\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
+ ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ "st1w { z4.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z6.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z1.s }, p1, [x9]\n"
+ "st1w { z5.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z3.s }, p1, [x28]\n"
+ "st1w { z7.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x6, x4\n"
"bne 12f\n"
"cbz x22, 10f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 9f\n"
"cmp x22, #0x2\n"
"beq 8f\n"
@@ -311,335 +311,335 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z21.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
+ ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
+ ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
"7:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ "ld1w { z9.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
+ ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
+ ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
"8:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p1/Z, [x16]\n"
+ ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
"addvl x22, SP, #12\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
"addvl x21, SP, #18\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
"addvl x20, SP, #24\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z1.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
+ "ld1w { z19.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
+ ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
+ ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
+ ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
+ ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
+ ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
"addvl x23, SP, #6\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
"addvl x22, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z20.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z10.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
+ "ld1w { z17.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x24]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
+ ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
+ "ld1w { z18.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
+ ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
+ ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
+ ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
+ ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
+ ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 20f\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "sub x17, x17, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "sub x25, x25, #0x1\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x15, x15, #0x1\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "cmp x17, x15\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "cmp x25, x15\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x25, x17, x15, LT\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "csel x25, x25, x15, LT\n"
+ ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
"sub x15, x15, x25\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"cbz x25, 19f\n"
"11:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x23, SP, #12\n"
- "ld1w { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
+ "ld1w { z27.s }, p1/Z, [x16]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
"ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
"ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0x648aaacc // bfcvtnt z12.h, p2/M, z22.s\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
+ ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
+ ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
+ ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
+ ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
+ "st1w { z8.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z10.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "st1w { z3.s }, p1, [x28]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "st1w { z11.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
"cbz x22, 17f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 16f\n"
"cmp x22, #0x2\n"
"beq 15f\n"
@@ -649,449 +649,449 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
"add x21, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x20, SP, #24\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
+ "ld1w { z9.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
+ ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
+ ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
"add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x21, SP, #18\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
"addvl x20, SP, #24\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
+ ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
"add x23, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x22, SP, #12\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ "ld1w { z26.s }, p0/Z, [x23]\n"
"addvl x21, SP, #18\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
+ ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
+ ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
+ ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
+ ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
"add x24, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"addvl x23, SP, #6\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
"addvl x22, SP, #12\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
+ ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
"17:" // Padded: 0 priming loads
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 20f\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x17, x17, #0x1\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "sub x25, x25, #0x1\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
"sub x15, x15, x25\n"
"cbz x25, 19f\n"
"18:" // Padded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x23, SP, #12\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "ld1w { z23.s }, p0/Z, [x16]\n"
+ ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
"add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z21.s }, p0/Z, [x22]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z8.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
- ".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0x648aaacc // bfcvtnt z12.h, p2/M, z22.s\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
- ".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
"add x8, x8, #0x2\n"
".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
"st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
+ "add x14, x14, x5, LSL #2\n"
"st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
"st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 18b\n"
"19:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12611b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xc12911d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12811d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "st1w { z20.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z22.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z21.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- "st1w { z3.s }, p1, [x28]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z23.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"20:" // Main loop skip tail
"cbz x15, 22f\n"
@@ -1100,16 +1100,16 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
"st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x0, LSL #2\n"
+ "add x14, x14, x5, LSL #2\n"
"st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 21b\n"
@@ -1118,12 +1118,12 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20, LSL #2\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1141,7 +1141,7 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x0", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
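(Illustrative aside, not part of the patch: the sme2_fp32bf16fp32 planar kernels in this diff convert fp32 activations and weights to bf16 with BFCVT/BFCVTNT and accumulate with BFDOT into fp32 ZA slices. The following is a minimal scalar C++ sketch of that numeric pattern only, assuming round-to-nearest-even conversion and ignoring BFDOT's denormal-flushing and intermediate-rounding details; the function names are hypothetical and nothing here appears in the library.

#include <cstdint>
#include <cstring>

// fp32 -> bf16 (round-to-nearest-even), returned as the raw 16-bit
// pattern; a simplified stand-in for BFCVT on normal, non-NaN inputs.
static uint16_t fp32_to_bf16(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t rounding = 0x7fffu + ((bits >> 16) & 1u); // ties to even
    return static_cast<uint16_t>((bits + rounding) >> 16);
}

// bf16 -> fp32 widening (always exact).
static float bf16_to_fp32(uint16_t h)
{
    const uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// One BFDOT lane: acc += a0*b0 + a1*b1. Each bf16 product is exact in
// fp32 (8-bit significands), so only the additions round; the hardware's
// exact accumulation order differs, which this sketch does not model.
static float bfdot_lane(float acc, const uint16_t a[2], const uint16_t b[2])
{
    return acc + bf16_to_fp32(a[0]) * bf16_to_fp32(b[0])
               + bf16_to_fp32(a[1]) * bf16_to_fp32(b[1]);
}

End of aside.)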
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
index c99cf51da4..53e596418b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
index 01f689a0b4..3a56e69d26 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -76,134 +76,134 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z30.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z4.s, #0x0\n"
+ "fmov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x20\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "incb x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
"addvl x24, SP, #15\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
"addvl x24, x24, #-3\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z15.h }, p2, [x24]\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
"addvl x24, x24, #-3\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "incb x20\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "incb x21\n"
+ ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
+ "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
+ "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"addvl x24, x24, #-3\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x20\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z4.h }, p2, [x24]\n"
+ ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "mov x21, x20\n"
+ "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
"addvl x24, x24, #-3\n"
- "st1h { z1.h }, p2, [x24]\n"
- "ld1w { z31.s }, p2/Z, [x21]\n"
+ "st1h { z24.h }, p2, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x21]\n"
+ ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
+ "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x21]\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
- ".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
+ ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
"ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z18.s }, p2/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
"incb x21, ALL, MUL #5\n"
"sub x20, x7, #0x1\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
+ "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
"orr x23, x20, %x[ld_in_col], LSL #18\n"
"addvl x24, x24, #-3\n"
- "mov z5.d, z4.d\n"
+ "mov z17.d, z16.d\n"
"orr x23, x5, x23, LSL #20\n"
"mov x22, #0xb\n"
- "mov z6.d, z4.d\n"
- "mov z7.d, z4.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
"add x21, x4, x3\n"
"lsl x20, %x[ld_in_row], #0x2\n"
- ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "st1h { z1.h }, p2, [x24]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
+ ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
+ "st1h { z9.h }, p2, [x24]\n"
+ ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
"mov x8, #0x0\n"
- "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
"ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
"lsl x23, x23, #0x2\n"
"sub x22, x22, x21\n"
@@ -213,20 +213,20 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x2\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040c83 // mova za.d[x8, #3], { z4.d-z7.d }\n"
+ ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
@@ -234,21 +234,21 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
"lsr x21, x21, #0x1\n"
"sub x16, x16, x21\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z24.s }, p1, [x15]\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- "st1w { z25.s }, p1, [x14]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -264,331 +264,331 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x17]\n"
- ".inst 0x658aaaea // bfcvt z10.h, p2/M, z23.s\n"
+ "ld1w { z0.s }, p1/Z, [x17]\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
"addvl x20, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
+ ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
"7:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
"addvl x20, SP, #9\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
+ ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
"8:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"addvl x21, SP, #6\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
"addvl x20, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
"addvl x21, SP, #3\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
"addvl x20, SP, #9\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z11.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
+ ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
"sub x7, x7, #0x2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x16, x16, #0x1\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
"lsr x20, x7, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x16\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
"csel x26, x20, x16, LT\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
"and x7, x7, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
"sub x16, x16, x26\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"cbz x26, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- "ld1w { z18.s }, p1/Z, [x17]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b21 // ld1h { z1.h, z9.h }, pn10.b/Z, [x25]\n"
+ "ld1w { z14.s }, p1/Z, [x17]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row], LSL #2\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"addvl x21, SP, #9\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"subs x26, x26, #0x1\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ "ld1w { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
+ ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
+ ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "st1w { z24.s }, p1, [x15]\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "st1w { z9.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "st1w { z26.s }, p1, [x10]\n"
+ ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
+ "st1w { z10.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
+ "ld1w { z31.s }, p1/Z, [x17]\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
+ "st1w { z11.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
+ ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
+ ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -603,282 +603,282 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z21.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
"addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
"addvl x20, SP, #12\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z22.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z7.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
"addvl x21, SP, #3\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
"addvl x20, SP, #9\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"17:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
"lsr x20, x7, #0x1\n"
@@ -889,323 +889,323 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"sub x16, x16, x24\n"
"cbz x24, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x23, SP, #6\n"
"addvl x21, SP, #12\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x22, SP, #3\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
"addvl x21, SP, #9\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x17]\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
"mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
+ ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
"subs x24, x24, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"st1w { z24.s }, p1, [x15]\n"
"mov x12, #0x8\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z25.s }, p1, [x14]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
"add x15, x15, x13, LSL #2\n"
"add x14, x14, x11, LSL #2\n"
"st1w { z26.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z27.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- ".inst 0x648aaaaa // bfcvtnt z10.h, p2/M, z21.s\n"
- ".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
- ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
+ ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
+ ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
+ ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z5.s }, p0/Z, [x17]\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #3\n"
"addvl x20, SP, #9\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
+ ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "st1w { z24.s }, p1, [x15]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x9]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
"20:" // Main loop skip tail
"cbz x7, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x17]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
"mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
+ ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
"addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"addvl x20, SP, #12\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x15]\n"
+ ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
+ ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
+ ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
"add x8, x8, #0x1\n"
- "st1w { z25.s }, p1, [x14]\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x9]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"21:" // Tail input: End
"cbz x16, 23f\n"
"22:" // Right padding loop
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x15]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
+ "st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z27.s }, p1, [x9]\n"
+ "st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 22b\n"
"23:" // End
@@ -1213,12 +1213,12 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20, LSL #2\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
index be4f02fc30..de3eadac8a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 6c42c76683..845f376926 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "ld1sb { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1sb { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1sb { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1sb { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1sb { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1sb { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1sb { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
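(The output stores in the kernel above, and in the stride-2 variant that follows, funnel every 32-bit ZA accumulator through the same four-instruction requantization idiom: sqdmulh by the per-layer multiplier, srshl by the per-layer shift (negative here, so a rounding right shift), add of the c_offset zero point, and sclamp to the minval/maxval activation bounds before the narrowing st1b store. The scalar C++ sketch below models one lane of that sequence as a reading aid; the function name and signature are illustrative only and do not appear in the generated kernels.

#include <algorithm>
#include <cstdint>

// One-lane model of the sqdmulh -> srshl -> add -> sclamp sequence used
// when writing out each ZA accumulator. Parameter names follow the
// Requantize32 fields referenced in the asm operands.
static int8_t requantize_lane(int32_t acc,
                              int32_t per_layer_mul,
                              int32_t per_layer_shift,  // <= 0 in these kernels
                              int32_t c_offset,
                              int32_t minval,
                              int32_t maxval)
{
    // SQDMULH: saturating doubling multiply returning the high half.
    // (2*a*b) >> 32 equals (a*b) >> 31 on the full 64-bit product; the
    // clamp reproduces the saturation at INT32_MIN * INT32_MIN.
    const int64_t prod = static_cast<int64_t>(acc) * per_layer_mul;
    const int32_t high = static_cast<int32_t>(
        std::clamp<int64_t>(prod >> 31, INT32_MIN, INT32_MAX));

    // SRSHL: shift left by a signed amount; a negative amount performs a
    // rounding arithmetic shift right (rounding constant 2^(n-1)).
    int32_t shifted;
    if (per_layer_shift >= 0) {
        shifted = static_cast<int32_t>(
            static_cast<int64_t>(high) << per_layer_shift);
    } else {
        const int32_t n = -per_layer_shift;
        shifted = static_cast<int32_t>(
            (static_cast<int64_t>(high) + (int64_t{1} << (n - 1))) >> n);
    }

    // ADD the output zero point, then SCLAMP to the activation range
    // before the value is narrowed to a byte by st1b.
    const int32_t out = shifted + c_offset;
    return static_cast<int8_t>(std::clamp(out, minval, maxval));
}
)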
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
index d14d662240..56fb127aa0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 03575aa799..1d0efc6bc1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1sb { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1sb { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1sb { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
index 6f3290fd3c..40fa718266 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index d366b3c8d5..bb68733a45 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "ld1sb { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1sb { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1sb { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1sb { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1sb { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1sb { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1sb { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1sb { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
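
As an aside for readers tracing the register renames above: the output path of these kernels is the standard Requantize32 sequence visible in the diff, a sqdmulh by per_layer_mul, an srshl by per_layer_right_shift, an add of c_offset, then an sclamp between minval and maxval before each st1b store. A minimal scalar sketch of that per-lane arithmetic follows; the helper names are illustrative only, the saturating corner case of sqdmulh follows the SVE specification, and the real kernels of course process four full vectors of accumulators per iteration rather than one lane.

#include <algorithm>
#include <cstdint>

// SQDMULH: saturating doubling multiply returning the high 32 bits.
static int32_t sqdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // the only saturating case
    return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 31);
}

// SRSHL: signed rounding shift left; a negative shift amount is a rounding
// right shift, which is why Requantize32 stores right shifts as non-positive.
static int32_t srshl(int32_t x, int32_t shift)
{
    if (shift >= 0) return x << shift; // not exercised by these kernels
    return static_cast<int32_t>((x + (int64_t{1} << (-shift - 1))) >> -shift);
}

// One output lane: int32 accumulator -> quantized int8 result.
static int8_t requantize(int32_t acc, int32_t per_layer_mul, int32_t per_layer_right_shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqdmulh(acc, per_layer_mul); // sqdmulh
    v = srshl(v, per_layer_right_shift);     // srshl
    v += c_offset;                           // add
    v = std::clamp(v, minval, maxval);       // sclamp
    return static_cast<int8_t>(v);           // st1b keeps the low byte
}
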
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
index e7a781d072..8bffc05e1f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 3e8510392f..3da0d14d74 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1sb { z21.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1sb { z21.s }, p1/Z, [x22]\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1sb { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1sb { z19.s }, p1/Z, [x23]\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1sb { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1sb { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1sb { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1sb { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1sb { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1sb { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
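The store epilogue repeated throughout these kernels — sqdmulh by the per-layer or per-channel multiplier, srshl by the (non-positive) right-shift, add of the output offset, then sclamp to the quantized range — corresponds to the scalar model below. This is a minimal sketch for orientation only, assuming the usual Requantize32 semantics; the function and variable names are ours, not the library's.

#include <algorithm>
#include <cstdint>

// Scalar model of one lane of the vector epilogue: sqdmulh -> srshl -> add -> sclamp.
static inline int32_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqdmulh: saturating doubling multiply returning the high half,
    // i.e. sat32((2 * acc * mul) >> 32) == sat32((acc * mul) >> 31).
    int64_t high = (static_cast<int64_t>(acc) * static_cast<int64_t>(mul)) >> 31;
    high = std::min<int64_t>(std::max<int64_t>(high, INT32_MIN), INT32_MAX);

    // srshl with a negative shift amount is a rounding arithmetic shift right.
    int32_t v = static_cast<int32_t>(high);
    if (right_shift < 0)
    {
        const int s = -right_shift;
        v = static_cast<int32_t>((static_cast<int64_t>(v) + (static_cast<int64_t>(1) << (s - 1))) >> s);
    }

    // Add the output (c) offset, then clamp to the representable output range.
    v += c_offset;
    return std::min(std::max(v, minval), maxval);
}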
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
index 875a9f8294..2e40c75d6b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
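Before each sdot, these kernels widen two quantized input rows to 16 bits, apply the input zero-point correction (the negated a_offset broadcast at the top of the function), and zip the rows with trn1 so that each 32-bit lane holds one element of each row. A rough scalar equivalent of that ld1b/add/trn1 sequence, with illustrative names only:

#include <cstdint>
#include <vector>

// Zip two widened, offset-corrected input rows into even/odd 16-bit lanes,
// mirroring the ld1b / add / trn1 sequence in the assembly above.
std::vector<int16_t> interleave_rows(const uint8_t *row0, const uint8_t *row1,
                                     int n, int16_t neg_zero_point)
{
    std::vector<int16_t> zipped(2 * n);
    for (int i = 0; i < n; ++i)
    {
        zipped[2 * i + 0] = static_cast<int16_t>(row0[i]) + neg_zero_point; // even lane: row 0
        zipped[2 * i + 1] = static_cast<int16_t>(row1[i]) + neg_zero_point; // odd lane:  row 1
    }
    return zipped;
}

Each 16-bit sdot over such a pair of vectors then accumulates both rows' contributions to a ZA slice in a single instruction, which is why the weight vectors stored to the stack earlier are likewise trn1-interleaved pairs of kernel rows.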
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index a7ef556840..60c3a1e632 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1b { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
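Across the main loops the ZA slices behave as a ring of output-row accumulators: the two oldest rows are read out with mova, requantized, and stored, and the slices four and five ahead of x8 are immediately re-seeded with the bias before x8 advances by two. A runnable sketch of that recycling follows; the window depth, lane count, and all names are illustrative inferences from the assembly, not the library's API.

#include <array>
#include <cstdint>
#include <cstdio>

int main()
{
    constexpr int kWindow = 6;   // za.d[x8, #0] .. za.d[x8, #5] in the s1 kernels
    constexpr int kLanes  = 4;   // stand-in for the vector length in 32-bit lanes
    std::array<std::array<int32_t, kLanes>, kWindow> za{};
    const std::array<int32_t, kLanes> bias{10, 20, 30, 40};

    int x8 = 0;
    for (int step = 0; step < 3; ++step)
    {
        // mova { ... }, za.d[x8, #0] / [x8, #1]: retire the two oldest rows
        // (the real kernel requantizes and st1b-stores them at this point).
        std::printf("retiring slices %d and %d\n",
                    (x8 + 0) % kWindow, (x8 + 1) % kWindow);

        // mova za.d[x8, #4] / [x8, #5], bias: re-seed the slices entering the window.
        za[(x8 + 4) % kWindow] = bias;
        za[(x8 + 5) % kWindow] = bias;

        x8 += 2;                 // add x8, x8, #0x2
    }
    return 0;
}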
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
index b878914ce8..f852e12de1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 630d870433..e4ce6c74fb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1b { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1b { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
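
Note: the output path visible in the store sections above is the Requantize32 sequence — sqdmulh by the per-layer (or per-channel) multiplier, srshl rounding shift, add of the c_offset, then sclamp to [minval, maxval] before the st1b stores. A minimal scalar sketch of that arithmetic follows, assuming per-layer parameters; the function name and signature are illustrative only and not part of the library.

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of the per-element output path in the kernels above:
    //   sqdmulh -> srshl (rounding right shift) -> add c_offset -> sclamp.
    static inline uint8_t requantize32(int32_t acc, int32_t mul, int32_t shift,
                                       int32_t c_offset, int32_t minval,
                                       int32_t maxval)
    {
        // SQDMULH: saturating doubling multiply returning the high half,
        // i.e. (acc * mul) >> 31, saturating only when both inputs are
        // INT32_MIN.
        int64_t prod = (int64_t)acc * (int64_t)mul;
        int32_t high = (prod == ((int64_t)1 << 62)) ? INT32_MAX
                                                    : (int32_t)(prod >> 31);

        // SRSHL shifts left for positive and right for negative amounts, so
        // the requantization right shift arrives as a negative value; a
        // rounding right shift adds 2^(n-1) before shifting by n.
        if (shift < 0)
        {
            int32_t n = -shift;
            high = (int32_t)(((int64_t)high + ((int64_t)1 << (n - 1))) >> n);
        }

        int32_t out = high + c_offset;                  // add { ... }, c_offset
        out = std::min(std::max(out, minval), maxval);  // sclamp
        return (uint8_t)out;
    }

The SME2 kernels perform the same steps four vectors at a time on the za accumulators, with mul and shift broadcast from Requantize32_per_layer_mul / _per_layer_right_shift or loaded per channel from the corresponding arrays.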
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
index db0750eb08..d8b87dcd55 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 2c19e232f8..d33ef764ef 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
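
// Note (not part of the patch): the epilogue repeated throughout this kernel
// — mova from ZA, then sqdmulh / srshl / add / sclamp before the narrowing
// st1b stores — is the standard arm_gemm::Requantize32 path using the
// per_layer_mul, per_layer_right_shift, c_offset, minval and maxval operands
// loaded in the prologue. A minimal scalar sketch of that arithmetic, with a
// hypothetical helper name, assuming per_layer_right_shift <= 0 as the srshl
// shift operand:
//
//   #include <cstdint>
//   #include <algorithm>
//
//   static inline int32_t requantize_value(int32_t acc,
//                                          int32_t per_layer_mul,
//                                          int32_t per_layer_right_shift,
//                                          int32_t c_offset,
//                                          int32_t minval,
//                                          int32_t maxval)
//   {
//       // sqdmulh: saturating high half of the doubled product,
//       // i.e. sat((2 * acc * mul) >> 32) == sat((acc * mul) >> 31).
//       int64_t high64 = ((int64_t)acc * (int64_t)per_layer_mul) >> 31;
//       int32_t high   = high64 > INT32_MAX ? INT32_MAX
//                      : high64 < INT32_MIN ? INT32_MIN
//                      : (int32_t)high64;
//
//       // srshl by a negative amount: rounding arithmetic shift right.
//       int32_t shift = -per_layer_right_shift;
//       if (shift > 0)
//       {
//           int64_t rounding = (int64_t)1 << (shift - 1);
//           high = (int32_t)(((int64_t)high + rounding) >> shift);
//       }
//
//       // add the output zero point, then sclamp to [minval, maxval].
//       high += c_offset;
//       return std::min(std::max(high, minval), maxval);
//   }
//
// The SME2 code applies this lane-wise to four vectors pulled out of ZA at a
// time before the st1b stores narrow them back to bytes.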
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
index 9fa295b20e..05aad19c09 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
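
// Note (not part of the patch): the priming-load hunks in the generic.cpp
// diff below repeatedly load two consecutive input rows as bytes (ld1b into
// .s lanes), fold in the negated activation offset ("neg z7.h" makes z7 hold
// -a_offset), and trn1-interleave the two rows into one vector of 16-bit
// lanes feeding the sdot accumulation. A scalar sketch of that staging, with
// hypothetical names; assumptions are as stated in the comments:
//
//   #include <cstdint>
//   #include <cstddef>
//
//   static void stage_two_rows(const uint8_t *row0, const uint8_t *row1,
//                              int16_t neg_a_offset, // holds -a_offset
//                              int16_t *out, size_t n)
//   {
//       for (size_t i = 0; i < n; ++i)
//       {
//           // widen to 16 bits and subtract the activation zero point
//           int16_t a = (int16_t)row0[i] + neg_a_offset;
//           int16_t b = (int16_t)row1[i] + neg_a_offset;
//           // trn1-style interleave: row0 in even lanes, row1 in odd lanes
//           out[2 * i + 0] = a;
//           out[2 * i + 1] = b;
//       }
//   }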
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 468e6778a4..6c144afa77 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1b { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1b { z19.s }, p1/Z, [x23]\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
index de574fff9a..a4345097b5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 1636225b31..612beb342a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -73,96 +73,96 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z24.h, p2/M, z24.h\n"
+ "neg z21.h, p2/M, z21.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z8.s, #0x0\n"
+ "mov z30.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z20.h, #0x0\n"
- "sub z27.h, z27.h, z21.h\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
"incw x22\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #3\n"
- "sub z23.h, z23.h, z21.h\n"
- "trn1 z0.h, z20.h, z27.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
"mov x20, x22\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
"addvl x21, SP, #12\n"
"incw x22\n"
"addvl x21, x21, #-4\n"
"mov x20, x22\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z27.h, z27.h, z21.h\n"
- "sub z23.h, z23.h, z21.h\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
"addvl x21, x21, #-4\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z21.h\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
"st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z20.h, z27.h\n"
- "trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
"addvl x21, x21, #-4\n"
- "trn1 z2.h, z23.h, z16.h\n"
- "trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -182,21 +182,21 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "ldp x27, x26, [x25], #0x10\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +204,22 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +231,148 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
+ "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z14.h, z14.h, z24.h\n"
+ "add z7.h, z7.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
"ld1b { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
"ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "add z13.h, z13.h, z24.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z15.h, z15.h, z24.h\n"
+ "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -384,118 +384,118 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
"addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- "trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z19.h, z18.h\n"
- "trn1 z14.h, z17.h, z16.h\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
"csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"sub x13, x13, x23\n"
@@ -503,121 +503,121 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
- "add z21.h, p0/M, z21.h, z24.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add z20.h, p0/M, z20.h, z24.h\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
"addvl x21, SP, #4\n"
- "add z18.h, p0/M, z18.h, z24.h\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
"subs x23, x23, #0x1\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z24.h\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "add z16.h, p0/M, z16.h, z24.h\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- "trn1 z14.h, z19.h, z18.h\n"
- "trn1 z15.h, z17.h, z16.h\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x27]\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x26]\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
- ".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
- ".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
"st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
"st1b { z6.s }, p1, [x10]\n"
@@ -628,15 +628,15 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
index e412216af3..104c11fc9d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 2848a015db..8ce04fb8c2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -73,86 +73,86 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z5.h, p2/M, z5.h\n"
+ "neg z11.h, p2/M, z11.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z0.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z24.h, z24.h, z13.h\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
"incw x22\n"
- "mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
"mov x20, x22\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z13.h\n"
+ "sub z2.h, z2.h, z16.h\n"
"addvl x21, SP, #6\n"
"ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z13.h\n"
+ "sub z25.h, z25.h, z16.h\n"
"incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
"addvl x21, x21, #-2\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z24.h, z24.h, z13.h\n"
- "sub z25.h, z25.h, z13.h\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z13.h\n"
+ "sub z20.h, z20.h, z16.h\n"
"addvl x21, x21, #-2\n"
- "st1h { z10.h }, p2, [x21]\n"
- "mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
"addvl x21, x21, #-2\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
- "trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x21]\n"
- "trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x15, #0x1\n"
@@ -172,18 +172,18 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x14, x7, x20, x14\n"
- ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
"mov x22, #0x2\n"
- "ldp x11, x10, [x25], #0x10\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x27, x26, [x25], #0x10\n"
+ "ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -191,24 +191,24 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +220,194 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x23, x20, x13, LT\n"
- "add z13.h, z13.h, z5.h\n"
+ "add z22.h, z22.h, z11.h\n"
"ld1b { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x23\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z12.h, z12.h, z5.h\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x10]\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z23.h, z23.h, z24.h\n"
"add x27, x27, x25\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z13.h, z13.h, z5.h\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "add z14.h, z14.h, z5.h\n"
- "add z15.h, z15.h, z5.h\n"
- "add z16.h, z16.h, z5.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -418,442 +418,442 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"addvl x20, SP, #4\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"addvl x20, SP, #2\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z3.d\n"
"csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
"sub x13, x13, x22\n"
"cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"addvl x20, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z16.d, z16.d\n"
+ "mov z25.d, z2.d\n"
"ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
"add x8, x8, #0x1\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x27]\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z12.s }, p0/Z, [x14]\n"
- "add z12.h, p0/M, z12.h, z5.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
"add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z5.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z5.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z5.h\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z5.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z5.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z5.h\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z5.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
"addvl x20, SP, #4\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x10]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z18.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
- ".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x11]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z29.s }, p1, [x10]\n"
+ "st1b { z1.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z30.s }, p1, [x27]\n"
+ "st1b { z2.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z31.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x20\n"
- "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
index 6071197340..52173b8551 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
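
(Aside: the bulk of the generic.cpp hunks below are a mechanical re-allocation of z/x registers in the regenerated kernel; the requantization output stage itself is unchanged. As a reference, here is a minimal scalar sketch of what the sqdmulh / srshl / add / sclamp sequence seen throughout these kernels computes per 32-bit lane, assuming the usual Requantize32 per-layer semantics. Function and parameter names are illustrative only, not part of the library's API.)

#include <cstdint>
#include <algorithm>

// Scalar model of the output stage:
//   SQDMULH (saturating doubling multiply, high half) -> SRSHL (rounding
//   shift) -> ADD c_offset -> SCLAMP -> ST1B.
// The kernel applies this to whole SVE vectors read back from ZA.
uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                        int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQDMULH: (2 * acc * mul) >> 32, saturating on the one overflow case.
    int32_t scaled;
    if (acc == INT32_MIN && mul == INT32_MIN)
        scaled = INT32_MAX;
    else
        scaled = (int32_t)(((int64_t)acc * mul * 2) >> 32);

    // SRSHL by a negative amount is a rounding arithmetic right shift
    // (positive amounts, i.e. left shifts, omitted for brevity).
    if (shift < 0)
    {
        int32_t s = -shift;
        scaled = (int32_t)(((int64_t)scaled + (1LL << (s - 1))) >> s);
    }

    scaled += c_offset;                                  // output zero point
    scaled = std::max(minval, std::min(maxval, scaled)); // SCLAMP
    return (uint8_t)scaled; // ST1B keeps the low byte of each .s lane
}
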
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 3e77c75ad7..64023eeaff 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -69,196 +69,196 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
"mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x4\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z25.h, p2/M, z25.h\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z6.s, #0x0\n"
+ "mov z18.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x22, x23\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z2.h, #0x0\n"
- "sub z18.h, z18.h, z12.h\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z17.h, z17.h, z12.h\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z16.h, z16.h, z12.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "trn1 z10.h, z16.h, z15.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "sub z21.h, z21.h, z12.h\n"
- "addvl x21, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"incw x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z15.h, z15.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "trn1 z0.h, z2.h, z18.h\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
"incw x23\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z21.h, z21.h, z12.h\n"
- "mov x22, x23\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "sub z18.h, z18.h, z12.h\n"
- "sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "addvl x21, x21, #-6\n"
- "sub z21.h, z21.h, z12.h\n"
- "sub z16.h, z16.h, z12.h\n"
- "mov x22, x23\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x22]\n"
- "incw x22, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x22]\n"
- "sub z18.h, z18.h, z12.h\n"
- "addvl x21, x21, #-6\n"
- "sub z17.h, z17.h, z12.h\n"
- "sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z12.h\n"
- "sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "mov z7.d, z6.d\n"
- "trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z8.h, z18.h, z17.h\n"
- "trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "trn1 z5.h, z21.h, z16.h\n"
- "trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "addvl x21, x21, #-6\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x21]\n"
- "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x17, #0x1\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
"orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
"orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x4\n"
+ "add x21, x6, x5\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
@@ -271,56 +271,56 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x16, x6, x20, x16\n"
- ".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x25], #0x10\n"
- ".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x3, x10, [x20], #0x10\n"
- ".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x9, x28, [x25], #0x10\n"
- ".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
- ".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x6, x5\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
@@ -328,338 +328,338 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x24]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
+ "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "cmp x17, x15\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z28.h, z28.h, z25.h\n"
+ "add z26.h, z26.h, z17.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "add z30.h, z30.h, z25.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x16]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
"ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- "add z27.h, z27.h, z25.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- "add z28.h, z28.h, z25.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z29.h, z29.h, z25.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x17, x17, x22\n"
+ "sub x25, x25, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -668,515 +668,515 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x20, SP, #24\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x21, SP, #18\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
"addvl x20, SP, #24\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"addvl x23, SP, #6\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"addvl x22, SP, #12\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
"addvl x20, SP, #24\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x17, 22f\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x16]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z19.h, z18.h\n"
- "trn1 z28.h, z17.h, z16.h\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- "sub x17, x17, #0x1\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
"sub x15, x15, #0x1\n"
- "cmp x17, x15\n"
- "trn1 z29.h, z19.h, z18.h\n"
- "trn1 z30.h, z17.h, z16.h\n"
- "csel x25, x17, x15, LT\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
"add x16, x16, %x[ld_in_col]\n"
"sub x15, x15, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x16]\n"
- "add z23.h, p0/M, z23.h, z25.h\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
"add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x23, SP, #6\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x22, SP, #12\n"
- "add z22.h, p0/M, z22.h, z25.h\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "add z21.h, p0/M, z21.h, z25.h\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z25.h\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x24]\n"
- "add z19.h, p0/M, z19.h, z25.h\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x24]\n"
- "add z18.h, p0/M, z18.h, z25.h\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x24]\n"
- "add z17.h, p0/M, z17.h, z25.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
"add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x24]\n"
- "add z16.h, p0/M, z16.h, z25.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
- "trn1 z27.h, z23.h, z22.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z29.h, z19.h, z18.h\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1641789 // sdot za.s[x8, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xc16b17a8 // sdot za.s[x8, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a17a9 // sdot za.s[x8, 1], { z29.h-z30.h }, z10.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
- ".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
- ".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- ".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x14]\n"
- "add x14, x14, x3\n"
- "st1b { z14.s }, p1, [x13]\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z13.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z15.s }, p1, [x28]\n"
+ "st1b { z11.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x23, ALL, MUL #16\n"
- "incw x23, ALL, MUL #9\n"
- "str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x5\n"
- "whilelt p1.s, x5, x7\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x20\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
@@ -1194,7 +1194,7 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
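For reference, the store epilogue visible in the hunks above (sqdmulh, then srshl, then add, then sclamp before each st1b) is the standard per-layer requantization used by these u8s8u8q planar kernels; the register renaming in this patch does not change it. Below is a minimal scalar sketch of one output lane, assuming the usual SVE/SME2 semantics of SQDMULH and SRSHL and using the arm_gemm::Requantize32 fields named in the operand list (per_layer_mul, per_layer_right_shift, c_offset, minval, maxval); the helper names are illustrative and are not part of the patch:

#include <algorithm>
#include <cstdint>

// Saturating doubling multiply returning the high half (SQDMULH semantics).
static inline int32_t sqdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the only saturating case
    return (int32_t)(((int64_t)a * (int64_t)b * 2) >> 32);
}

// Signed rounding shift left (SRSHL semantics); a negative shift amount is a
// rounding right shift, which is how these kernels apply per_layer_right_shift.
static inline int32_t srshl(int32_t v, int32_t shift)
{
    if (shift >= 0) return (int32_t)((int64_t)v << shift);
    const int s = -shift;
    return (int32_t)(((int64_t)v + (INT64_C(1) << (s - 1))) >> s);
}

// One accumulator lane -> quantized u8 output, matching the
// sqdmulh / srshl / add / sclamp sequence emitted before each st1b.
static inline uint8_t requantize_lane(int32_t acc, int32_t per_layer_mul,
                                      int32_t per_layer_right_shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqdmulh(acc, per_layer_mul);
    v = srshl(v, per_layer_right_shift);
    v += c_offset;
    v = std::min(std::max(v, minval), maxval);  // sclamp
    return (uint8_t)v;
}

The vector code performs the same computation four rows at a time on register groups read out of ZA (the mova instructions) before the st1b stores.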
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
index 6949e69e39..ad82070912 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 33bb4eb8ec..d8dc69127e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -73,156 +73,156 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"ptrue p2.b\n"
"mov x20, #0xb\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
"sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x5\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
"ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z9.h, p2/M, z9.h\n"
+ "neg z7.h, p2/M, z7.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z12.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
"ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
"mov x20, x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z12.h, z12.h, z18.h\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
"incw x22\n"
- "mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"addvl x21, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z27.h, z27.h, z28.h\n"
"incw x22\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
+ "sub z14.h, z14.h, z28.h\n"
"addvl x21, x21, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z17.h, z17.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"mov x20, x22\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
"ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
"incw x22\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z12.h, z12.h, z18.h\n"
- "sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
"st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z0.h, z16.h, z14.h\n"
+ "trn1 z29.h, z11.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x22\n"
- "sub z24.h, z24.h, z18.h\n"
- "sub z17.h, z17.h, z18.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
"addvl x21, x21, #-3\n"
"mov x20, x22\n"
- "st1h { z2.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z18.h\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z25.h, z25.h, z18.h\n"
- "sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
- "sub z17.h, z17.h, z18.h\n"
- "sub z16.h, z16.h, z18.h\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"addvl x21, x21, #-3\n"
- "st1h { z2.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z28.d\n"
- "trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
"addvl x21, x21, #-3\n"
- "trn1 z10.h, z24.h, z17.h\n"
- "trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x21]\n"
- "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
"cbz x20, 3f\n"
- "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
"sub x20, x7, #0x1\n"
@@ -242,20 +242,20 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
"add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
"lsl x20, %x[ld_in_row], #0x0\n"
"msub x17, x4, x20, x17\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x25], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x10, x9, [x25], #0x10\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -263,24 +263,24 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
"sub x16, x16, x21\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -296,341 +296,341 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
"9:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
"10:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1b { z21.s }, p1/Z, [x22]\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x22]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z20.s }, p1/Z, [x22]\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x22]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x22]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x22]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
"12:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"add x21, x17, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
+ "trn1 z23.h, z23.h, z25.h\n"
"sub x16, x16, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x7, #0x1\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z30.h\n"
"cmp x20, x16\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"csel x26, x20, x16, LT\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"and x7, x7, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
"sub x16, x16, x26\n"
"cbz x26, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
"add x23, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
"subs x26, x26, #0x1\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
"add x17, x17, %x[ld_in_col]\n"
"add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x23]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x23]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1b { z19.s }, p1/Z, [x23]\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x23]\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add z16.h, z16.h, z9.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
"ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x15]\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1b { z11.s }, p1/Z, [x17]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x10]\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
"add x9, x9, x27\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z13.h, z13.h, z9.h\n"
+ "add z25.h, z25.h, z7.h\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
@@ -645,688 +645,688 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #6\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
"addvl x20, SP, #12\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
"add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"addvl x21, SP, #3\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
"addvl x20, SP, #9\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
"add x17, x17, %x[ld_in_col]\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
"19:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"sub x7, x7, #0x2\n"
"sub x16, x16, #0x1\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
"lsr x20, x7, #0x1\n"
"cmp x20, x16\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
"csel x25, x20, x16, LT\n"
"add x17, x17, %x[ld_in_col]\n"
"and x7, x7, #0x1\n"
"sub x16, x16, x25\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z27.h, z27.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
"add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
"add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x22]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x22]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
"ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x22]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x22]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x22]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x22]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
"add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
"add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z25.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x10]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x9]\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"22:" // Main loop skip tail
"cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x17]\n"
- "add z11.h, p0/M, z11.h, z9.h\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
"add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
- "add z12.h, p0/M, z12.h, z9.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "trn1 z12.h, z12.h, z20.h\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
- "add z13.h, p0/M, z13.h, z9.h\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z9.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z9.h\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z9.h\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z9.h\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z9.h\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
"add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z9.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
"addvl x21, SP, #6\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "mov z16.d, z16.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
"addvl x20, SP, #12\n"
"sub x16, x16, #0x1\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x14]\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z19.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
"23:" // Tail input: End
"cbz x16, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
"add x15, x15, x13\n"
- "st1b { z5.s }, p1, [x14]\n"
+ "st1b { z29.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z30.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z7.s }, p1, [x9]\n"
+ "st1b { z31.s }, p1, [x9]\n"
"add x9, x9, x27\n"
"bgt 24b\n"
"25:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x22, ALL, MUL #16\n"
- "incw x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"incw x6\n"
"whilelt p1.s, x6, x5\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x17, x17, x20\n"
- "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
"ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
"ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
"ldp x23, x22, [x25, #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1c1fb25e1f..edee21e941 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 9fd220abf8..d807856ccb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -108,10 +108,10 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"whilelt p2.h, XZR, %x[n_channels]\n"
"madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1h { z18.h }, p3/Z, [x10]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
"add x27, x13, x13\n"
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
@@ -125,10 +125,10 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x11, %x[n_channels]\n"
"add x23, x25, x23, LSL #1\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
"add x22, x28, x22, LSL #1\n"
"mov x21, #0x0\n"
@@ -142,175 +142,175 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
"whilelt p1.h, x11, %x[n_channels]\n"
"inch x21\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
"inch x11\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
"mov p0.b, p2.b\n"
- "ld1h { z18.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
"addvl x9, x9, #1\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
"addvl x26, x26, #1\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
"ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
"ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
"ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
"cmp x11, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
"ld1h { z10.h }, p1/Z, [x9]\n"
"ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
"ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
- "st1h { z28.h }, p0, [x28]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
"addvl x23, x23, #1\n"
- "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z22.h }, p0, [x22]\n"
"addvl x28, x28, #1\n"
"ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
"addvl x10, x10, #-6\n"
- "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
"ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
"add x21, x10, #0x1\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
"csel x10, x10, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
"csel x14, x14, XZR, LT\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
"cmp x10, x20\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p0, [x28]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 9242b470c3..90982b6990 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -87,7 +87,7 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z18.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
@@ -98,99 +98,99 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
"addvl x16, x16, #-6\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"whilelt p1.h, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
"inch x28\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
"mov p0.b, p2.b\n"
- "ld1h { z18.h }, p3/Z, [x16]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
"inch x9\n"
- "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
"inch x14\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
@@ -199,98 +199,98 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"cmp x14, %x[n_channels]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
"ldr x23, [x15, #0x60]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
"inch x28\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index d49b14eeaf..da2ef72a30 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index d2dae84089..a22ab39d6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x10, x23, LSL #1\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z18.h }, p3/Z, [x13]\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
@@ -129,10 +129,10 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
"add x24, x11, x21, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x15, %x[n_channels]\n"
"add x23, x24, x21, LSL #1\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
"add x22, x16, x16\n"
"mov x21, #0x0\n"
@@ -146,131 +146,131 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
"whilelt p1.h, x15, %x[n_channels]\n"
"inch x21\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
"inch x15\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
"inch x20\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z18.h }, p3/Z, [x13]\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
"addvl x10, x10, #1\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
"addvl x28, x28, #1\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
"addvl x14, x14, #1\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
"ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x14]\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
"ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
"addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
"ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
"addvl x26, x26, #1\n"
"ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
@@ -279,182 +279,182 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"cmp x15, %x[n_channels]\n"
"ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
"ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
"ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x26]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
"ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
- "st1h { z23.h }, p0, [x11]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
"addvl x11, x11, #1\n"
"ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
"st1h { z26.h }, p0, [x24]\n"
"addvl x13, x13, #-6\n"
- "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
"addvl x24, x24, #1\n"
- "st1h { z29.h }, p0, [x23]\n"
- "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x8, x8, #0x1\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
"cmp x8, x20\n"
"add x21, x13, #0x1\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
"csel x13, x13, x21, LT\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
"mov p0.b, p2.b\n"
"csel x8, x8, XZR, LT\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
"cmp x13, x20\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p0, [x11]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
"st1h { z26.h }, p0, [x24]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
- "st1h { z29.h }, p0, [x23]\n"
- "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
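For readers following the hunks above: the kernel bodies interleave predicated multiply-accumulates with the activation clamp (fmla, then fmax against the loaded minimum register and fmin against the loaded maximum register). A minimal ACLE-intrinsics sketch of that per-vector pattern follows, assuming an SVE-enabled toolchain; the helper name mla_clamp is illustrative and not part of this patch.

#include <arm_sve.h>  // Arm C Language Extensions for SVE

// Sketch only: one predicated fmla followed by the min/max activation clamp,
// as in the assembly above (fmax against the loaded activation minimum,
// fmin against the loaded activation maximum).
static inline svfloat16_t mla_clamp(svbool_t pg, svfloat16_t acc,
                                    svfloat16_t weight, svfloat16_t input,
                                    svfloat16_t vmin, svfloat16_t vmax)
{
  acc = svmla_f16_m(pg, acc, weight, input);  // acc += weight * input (active lanes)
  acc = svmax_f16_m(pg, acc, vmin);           // clamp below
  acc = svmin_f16_m(pg, acc, vmax);           // clamp above
  return acc;
}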
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 59c0e0cf0b..4f8368acd5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -88,390 +88,390 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z18.h }, p3/Z, [x17]\n"
- "cnth x15\n"
- "mov x14, #0x0\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "cnth x16\n"
+ "mov x15, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [x16, #0x20]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
- "addvl x17, x17, #-6\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "inch x13\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
"mov p1.b, p2.b\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x23, [x27, #0x0]\n"
- "whilelt p0.h, x15, %x[n_channels]\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.h, x16, %x[n_channels]\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x22, [x27, #0x8]\n"
- "ldr x21, [x27, #0x10]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ldr x20, [x27, #0x18]\n"
- "ld1h { z18.h }, p3/Z, [x17]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x28, [x16, #0x20]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
"fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ldr x23, [x27, #0x20]\n"
- "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "inch x14\n"
- "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
- "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "ld1h { z13.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
"inch x15\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "whilelt p2.h, x14, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
+ "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
+ "inch x16\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "inch x13\n"
- "mov p1.b, p2.b\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
+ "mov p0.b, p2.b\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x23, [x27, #0x0]\n"
- "ldr x22, [x27, #0x8]\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
"fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x21, [x27, #0x10]\n"
- "ldr x20, [x27, #0x18]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmax z24.h, p3/M, z24.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
"fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ldr x23, [x27, #0x20]\n"
- "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z25.h, p3/M, z25.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmax z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z17.h\n"
- "fmax z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
- "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
- "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
+ "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
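The indirect kernel above is structured as a vector-length-agnostic channel loop: cnth/inch advance the channel index by the fp16 vector length, whilelt builds the governing predicate, and the "Channel tail" block handles the final partial vector through the same predicated path. A hedged sketch of that loop shape in ACLE intrinsics follows; channel_loop and its copy body are illustrative stand-ins for the kernel's actual depthwise arithmetic.

#include <arm_sve.h>  // Arm C Language Extensions for SVE

// Sketch only: whilelt-predicated channel loop advancing by svcnth() lanes,
// mirroring the cnth/inch/whilelt structure of the kernel above. A plain
// copy stands in for the real per-channel computation.
void channel_loop(const float16_t *in, float16_t *out, unsigned int n_channels)
{
  for (uint64_t i = 0; i < n_channels; i += svcnth())
  {
    svbool_t pg = svwhilelt_b16(i, (uint64_t) n_channels);  // active lanes only
    svfloat16_t v = svld1_f16(pg, in + i);  // inactive lanes load as zero
    svst1_f16(pg, out + i, v);              // inactive lanes are not stored
  }
}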
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index ac6ae284fd..af5ee740c9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index c0b9137f6b..41eaa4f18c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
"add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
"add x13, x8, x23, LSL #1\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"add x12, x13, x23, LSL #1\n"
"add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
@@ -132,8 +132,8 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x5\n"
"add x26, x9, x22, LSL #1\n"
"add x25, x6, x6\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x16, %x[n_channels]\n"
"add x24, x28, x23, LSL #1\n"
"ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
@@ -149,500 +149,500 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
"whilelt p1.h, x16, %x[n_channels]\n"
"inch x21\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
+ "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
"inch x16\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
"inch x20\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
"ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
"ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z21.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
"ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
"ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
"ld1h { z11.h }, p2/Z, [x28]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
"ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z14.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z13.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z13.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
"fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z12.h\n"
"fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z13.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z9.h\n"
+ "fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x11]\n"
+ "fmla z25.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
"fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
- "addvl x8, x8, #1\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
"fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
- "addvl x12, x12, #1\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
"addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
"fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
"fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
"fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
"addvl x24, x24, #1\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
"addvl x13, x13, #1\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z0.h\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z0.h\n"
+ "fmla z24.h, p3/M, z4.h, z0.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
"ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
"ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
"ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
"ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
"ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
"addvl x17, x17, #16\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
"ld1h { z10.h }, p1/Z, [x8]\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
"ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "st1h { z16.h }, p0, [x15]\n"
+ "fmin z21.h, p3/M, z21.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
"ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z22.h, p3/M, z22.h, z16.h\n"
+ "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
"addvl x28, x28, #1\n"
- "st1h { z20.h }, p0, [x9]\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
"addvl x15, x15, #1\n"
- "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
+ "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
"addvl x17, x17, #-6\n"
- "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
- "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
+ "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
"addvl x9, x9, #1\n"
- "st1h { z24.h }, p0, [x26]\n"
- "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
"addvl x26, x26, #1\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x4, x4, #0x1\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
"cmp x4, x20\n"
"add x21, x16, #0x1\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x16, x16, x21, LT\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
"ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
"mov p0.b, p2.b\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24]\n"
- "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
"csel x4, x4, XZR, LT\n"
"cmp x16, x20\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
"fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "st1h { z16.h }, p0, [x15]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "st1h { z20.h }, p0, [x9]\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z24.h }, p0, [x26]\n"
- "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z10.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z22.h\n"
+ "fmla z18.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "fmla z30.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z13.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z25.h, p3/M, z3.h, z29.h\n"
+ "fmla z12.h, p3/M, z2.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z24.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z3.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z22.h\n"
+ "fmla z11.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z21.h\n"
+ "fmla z14.h, p3/M, z2.h, z29.h\n"
+ "fmla z31.h, p3/M, z5.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z29.h\n"
+ "fmla z27.h, p3/M, z3.h, z29.h\n"
+ "fmla z30.h, p3/M, z1.h, z29.h\n"
+ "fmla z11.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z19.h\n"
+ "fmla z12.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z22.h\n"
+ "fmla z14.h, p3/M, z3.h, z22.h\n"
+ "fmla z26.h, p3/M, z1.h, z22.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z19.h\n"
+ "fmla z14.h, p3/M, z6.h, z19.h\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "fmla z13.h, p3/M, z3.h, z19.h\n"
+ "fmla z10.h, p3/M, z1.h, z19.h\n"
+ "fmla z12.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z29.h\n"
+ "fmla z18.h, p3/M, z1.h, z29.h\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x12]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z0.h, z22.h\n"
+ "fmla z17.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11]\n"
+ "fmla z12.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z29.h\n"
+ "fmla z26.h, p3/M, z3.h, z29.h\n"
+ "fmla z10.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z9.h\n"
+ "fmla z12.h, p3/M, z7.h, z22.h\n"
+ "fmla z23.h, p3/M, z6.h, z22.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z13.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "fmla z12.h, p3/M, z5.h, z21.h\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z11.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z25.h, p3/M, z6.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z18.h, p3/M, z3.h, z22.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z17.h, p3/M, z1.h, z22.h\n"
+ "fmla z14.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z4.h, z29.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z30.h, p3/M, z2.h, z29.h\n"
+ "fmla z11.h, p3/M, z1.h, z29.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmax z11.h, p3/M, z11.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmla z10.h, p3/M, z4.h, z21.h\n"
+ "fmla z12.h, p3/M, z3.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmax z10.h, p3/M, z10.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmax z12.h, p3/M, z12.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z11.h, p3/M, z11.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z17.h }, p0, [x9]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z12.h, p3/M, z12.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z26.h }, p0, [x26]\n"
+ "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z10.h }, p0, [x23]\n"
+ "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
@@ -653,4 +653,4 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 972b78b6d5..c0be293cd7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -99,616 +99,616 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
- "cnth x15\n"
- "mov x14, #0x0\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "cnth x17\n"
+ "mov x16, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "inch x13\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
+ "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
"mov p1.b, p2.b\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "whilelt p0.h, x15, %x[n_channels]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.h, p3/M, z7.h, z25.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z25.h\n"
+ "fmla z31.h, p3/M, z3.h, z25.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
+ "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
+ "whilelt p0.h, x17, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "fmla z14.h, p3/M, z8.h, z25.h\n"
+ "fmla z23.h, p3/M, z5.h, z25.h\n"
+ "fmla z15.h, p3/M, z2.h, z25.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z9.h, p3/M, z1.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z13.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
"fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ldr x9, [x16, #0x118]\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.h, p3/M, z3.h, z25.h\n"
+ "fmla z14.h, p3/M, z0.h, z25.h\n"
+ "fmla z23.h, p3/M, z6.h, z29.h\n"
+ "fmla z15.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
"fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
- "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z5.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "fmla z9.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.h, p3/M, z7.h, z25.h\n"
+ "fmla z18.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.h, p3/M, z7.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z30.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.h, p3/M, z8.h, z10.h\n"
+ "fmla z21.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z5.h, z10.h\n"
+ "fmla z11.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z27.h, p3/M, z0.h, z29.h\n"
+ "fmla z14.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.h, p3/M, z6.h, z25.h\n"
+ "fmla z23.h, p3/M, z4.h, z25.h\n"
+ "fmla z30.h, p3/M, z3.h, z25.h\n"
+ "fmla z15.h, p3/M, z1.h, z25.h\n"
+ "fmla z18.h, p3/M, z0.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.h, p3/M, z2.h, z25.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z0.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z5.h, z25.h\n"
+ "fmla z28.h, p3/M, z1.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.h, p3/M, z0.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z3.h, z25.h\n"
+ "fmla z9.h, p3/M, z8.h, z12.h\n"
+ "fmla z11.h, p3/M, z5.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z6.h, z25.h\n"
+ "fmla z15.h, p3/M, z5.h, z25.h\n"
+ "fmla z13.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z29.h\n"
+ "fmla z21.h, p3/M, z6.h, z29.h\n"
+ "fmla z23.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z25.h\n"
+ "fmla z31.h, p3/M, z7.h, z25.h\n"
+ "fmla z13.h, p3/M, z6.h, z25.h\n"
+ "fmla z18.h, p3/M, z5.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z25.h\n"
+ "fmla z28.h, p3/M, z3.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z27.h, p3/M, z5.h, z25.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z25.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z21.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z20.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z25.h\n"
+ "fmla z11.h, p3/M, z1.h, z25.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z12.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
+ "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.h, p3/M, z4.h, z10.h\n"
+ "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla z28.h, p3/M, z4.h, z12.h\n"
- "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x58]\n"
- "inch x14\n"
- "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
- "inch x15\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x70]\n"
- "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x78]\n"
- "ld1h { z15.h }, p3/Z, [x17]\n"
- "whilelt p2.h, x14, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.h, p3/M, z23.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "inch x16\n"
+ "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z19.h\n"
+ "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.h, p3/M, z21.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
+ "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "inch x13\n"
- "mov p1.b, p2.b\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
"fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.h, p3/M, z7.h, z23.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
+ "fmla z15.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z23.h\n"
+ "fmla z20.h, p3/M, z3.h, z23.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
+ "fmla z27.h, p3/M, z8.h, z23.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "fmla z28.h, p3/M, z2.h, z23.h\n"
"fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "ldr x9, [x16, #0x118]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z16.h, p3/M, z16.h, z14.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmax z17.h, p3/M, z17.h, z14.h\n"
- "fmax z18.h, p3/M, z18.h, z14.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmax z19.h, p3/M, z19.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
- "fmin z17.h, p3/M, z17.h, z13.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
- "fmin z18.h, p3/M, z18.h, z13.h\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmin z19.h, p3/M, z19.h, z13.h\n"
- "fmax z20.h, p3/M, z20.h, z14.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmax z21.h, p3/M, z21.h, z14.h\n"
- "fmax z22.h, p3/M, z22.h, z14.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmax z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
- "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmin z20.h, p3/M, z20.h, z13.h\n"
- "fmin z21.h, p3/M, z21.h, z13.h\n"
- "fmin z22.h, p3/M, z22.h, z13.h\n"
- "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z23.h, p3/M, z23.h, z13.h\n"
- "fmax z24.h, p3/M, z24.h, z14.h\n"
- "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z25.h, p3/M, z25.h, z14.h\n"
- "fmax z26.h, p3/M, z26.h, z14.h\n"
- "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z27.h, p3/M, z27.h, z14.h\n"
- "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x58]\n"
- "fmin z24.h, p3/M, z24.h, z13.h\n"
- "fmin z25.h, p3/M, z25.h, z13.h\n"
- "fmin z26.h, p3/M, z26.h, z13.h\n"
- "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmin z27.h, p3/M, z27.h, z13.h\n"
- "fmax z28.h, p3/M, z28.h, z14.h\n"
- "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z29.h, p3/M, z29.h, z14.h\n"
- "fmax z30.h, p3/M, z30.h, z14.h\n"
- "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
- "ldr x21, [x28, #0x70]\n"
- "fmax z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
- "ldr x20, [x28, #0x78]\n"
- "fmin z28.h, p3/M, z28.h, z13.h\n"
- "fmin z29.h, p3/M, z29.h, z13.h\n"
- "fmin z30.h, p3/M, z30.h, z13.h\n"
- "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z13.h\n"
- "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
- "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
- "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "fmla z9.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z30.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z10.h, p3/M, z3.h, z29.h\n"
+ "fmla z25.h, p3/M, z2.h, z29.h\n"
+ "fmla z24.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.h, p3/M, z3.h, z23.h\n"
+ "fmla z27.h, p3/M, z0.h, z23.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.h, p3/M, z4.h, z29.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.h, p3/M, z8.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.h, p3/M, z5.h, z17.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "fmla z15.h, p3/M, z1.h, z17.h\n"
+ "fmla z11.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z4.h, z21.h\n"
+ "fmla z14.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z1.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z23.h\n"
+ "fmla z26.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z29.h\n"
+ "fmla z15.h, p3/M, z5.h, z29.h\n"
+ "fmla z11.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z22.h, p3/M, z0.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.h, p3/M, z6.h, z23.h\n"
+ "fmla z31.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z23.h\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z13.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "fmla z15.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z9.h, p3/M, z8.h, z23.h\n"
+ "fmla z11.h, p3/M, z5.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z5.h, z21.h\n"
+ "fmla z10.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z31.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z10.h, p3/M, z6.h, z21.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "fmla z24.h, p3/M, z4.h, z21.h\n"
+ "fmla z26.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z11.h, p3/M, z8.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z29.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z29.h\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z23.h\n"
+ "fmla z30.h, p3/M, z6.h, z23.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmla z20.h, p3/M, z8.h, z29.h\n"
+ "fmla z10.h, p3/M, z7.h, z29.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
+ "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z23.h\n"
+ "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.h, p3/M, z3.h, z23.h\n"
+ "fmla z24.h, p3/M, z5.h, z29.h\n"
+ "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.h, p3/M, z10.h, z19.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.h, p3/M, z25.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z25.h, p3/M, z25.h, z19.h\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
+ "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
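
The guard rewritten above is the pattern applied to every FP16 SVE kernel in this patch: the explicit __aarch64__ test is dropped in favour of __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, the ACLE feature macro a compiler defines only when native FP16 vector arithmetic instructions are available. A minimal sketch of the resulting fence (kernel body elided):

#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
// ... kernel implementation, compiled only when the build enables SVE and
// the toolchain provides FP16 arguments and FP16 vector arithmetic ...
} // namespace depthwise
} // namespace arm_conv
#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

In practice the feature macro, together with the SVE build flag, already implies a suitable Arm target, so the separate platform check becomes redundant.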
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 82173ee71f..d8a25666bd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
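
The constructor change in this header (and its siblings below) swaps the old positional magic numbers (2, 3, 2) for the class's named constexpr dimensions, forwarded through the Parent alias for the templated base. A minimal self-contained sketch of the pattern — StrategyBase here is a hypothetical stand-in for DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>, whose definition is not part of this diff:

// Stand-in base class, for illustration only.
struct StrategyBase
{
  StrategyBase(unsigned int /*output_rows*/, unsigned int /*output_cols*/,
               unsigned int /*kernel_rows*/, unsigned int /*kernel_cols*/,
               unsigned int /*stride_rows*/, unsigned int /*stride_cols*/) {}
};

class Sve3x3S2Sketch : public StrategyBase
{
  using Parent = StrategyBase;

  public:
  constexpr static unsigned int kernel_rows = 3, kernel_cols = 3;
  constexpr static unsigned int stride_rows = 2, stride_cols = 2;
  constexpr static unsigned int output_rows = 2, output_cols = 2;

  // Named dimensions instead of the old positional (2, 3, 2), so the base
  // receives the full output/kernel/stride description unambiguously.
  Sve3x3S2Sketch()
    : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
};

Besides readability, a future change to the constexpr dimensions now propagates to the base automatically.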
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 6a9b354c02..58decdba1c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -112,7 +112,7 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x28, x12, x23, LSL #1\n"
"madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z19.h }, p3/Z, [x11]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
"ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
@@ -128,8 +128,8 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x24, x26, x15\n"
"add x9, x9, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"cmp x13, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x23, x25, x23, LSL #1\n"
"add x22, x9, x21, LSL #1\n"
"ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
@@ -147,191 +147,191 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
"whilelt p1.h, x13, %x[n_channels]\n"
"inch x21\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
"inch x13\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ld1h { z14.h }, p2/Z, [x25]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z15.h }, p2/Z, [x27]\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
"addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
- "ld1h { z19.h }, p3/Z, [x11]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
"ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
"addvl x27, x27, #1\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
"ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
"ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
"ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
"ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
"ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
"ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
"cmp x13, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
"ld1h { z10.h }, p1/Z, [x12]\n"
"ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
"ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
"ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
"addvl x25, x25, #1\n"
"ld1h { z14.h }, p1/Z, [x28]\n"
"ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
"addvl x23, x23, #1\n"
"ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
- "st1h { z28.h }, p0, [x9]\n"
+ "st1h { z27.h }, p0, [x9]\n"
"ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
"addvl x9, x9, #1\n"
"ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
"addvl x11, x11, #-6\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ld1h { z14.h }, p2/Z, [x25]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
"add x16, x16, #0x1\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z15.h }, p2/Z, [x27]\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
"cmp x16, x20\n"
"add x21, x11, #0x1\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
"csel x11, x11, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
"csel x16, x16, XZR, LT\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
"ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
"cmp x11, x20\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z28.h }, p0, [x9]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
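
Most of the register renaming above is mechanical, but the epilogue it feeds is worth spelling out: each accumulator passes through an fmax against the broadcast minimum (now z29) and an fmin against the broadcast maximum (now z28) before st1h stores it, i.e. the fused activation clamp. A scalar sketch of what each fmax/fmin pair computes per lane (NaN-propagation details aside), assuming an Arm toolchain with __fp16 support:

// Per-lane effect of the fmax/fmin pair applied to every accumulator before
// it is stored; activation_min/activation_max are the values loaded by
// ld1rh from offsetof(Args, min) and offsetof(Args, max).
static inline __fp16 clamp_activation(__fp16 acc, __fp16 activation_min, __fp16 activation_max)
{
  acc = (acc > activation_min) ? acc : activation_min;  // fmax z<acc>, p3/M, z<acc>, z<min>
  acc = (acc < activation_max) ? acc : activation_max;  // fmin z<acc>, p3/M, z<acc>, z<max>
  return acc;
}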
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index ff97b51e28..d5fbb6baee 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -96,7 +96,7 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z19.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
@@ -111,8 +111,8 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
@@ -126,89 +126,89 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
"ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
"whilelt p1.h, x14, %x[n_channels]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"inch x9\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
"ldp x21, x20, [x15, #0x30]\n"
"ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
"ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
"ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
"inch x28\n"
"ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
"ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
@@ -216,122 +216,122 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"whilelt p2.h, x9, %x[n_channels]\n"
"ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
"ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
"ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
"inch x14\n"
- "ld1h { z19.h }, p3/Z, [x16]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
"cmp x14, %x[n_channels]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
"ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
"ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
"ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
"inch x28\n"
"mov p0.b, p2.b\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
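
The loop structure in this indirect kernel is the same predicated pattern used throughout: whilelt derives the active-lane predicate from the channel index against n_channels, inch steps the index by one vector of halfwords, and the final partial vector is handled by the same predicated loads and stores with no scalar tail. A rough ACLE intrinsics sketch of that shape — the single svmul stands in for the kernel's long fmla chains, and the function name and pointer layout are illustrative only:

#include <arm_sve.h>

// Rough sketch of the predicated channel loop (requires an SVE-enabled
// AArch64 toolchain, e.g. -march=armv8.2-a+sve+fp16).
void channel_loop_sketch(const float16_t *in, const float16_t *weights,
                         float16_t *out, unsigned int n_channels,
                         float16_t act_min, float16_t act_max)
{
  for (uint64_t c = 0; c < n_channels; c += svcnth())  // inch: one vector of halfwords
  {
    svbool_t pg = svwhilelt_b16_u64(c, n_channels);    // whilelt p2.h, <c>, n_channels
    svfloat16_t acc = svld1_f16(pg, in + c);           // ld1h { z }, p2/Z, [...]
    svfloat16_t w   = svld1_f16(pg, weights + c);
    acc = svmul_f16_m(pg, acc, w);                     // stands in for the fmla chains
    acc = svmax_n_f16_m(pg, acc, act_min);             // fmax: activation minimum
    acc = svmin_n_f16_m(pg, acc, act_max);             // fmin: activation maximum
    svst1_f16(pg, out + c, acc);                       // st1h { z }, p0, [...]
  }
}

Because whilelt deactivates out-of-range lanes, the tail iteration needs no separate code path — exactly the property the assembly relies on with its p1/p2 predicates.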
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index f5d4189a47..abdfac5a3f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
-void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
-void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
@@ -57,7 +57,7 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
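
One more recurring change in this patch: each translation unit's clobber list is rewritten to name exactly the registers its new allocation touches (the generic_indirect.cpp list above gains z20–z26 and drops z27–z31, while the first kernel in this section gains x7 and x8). In GCC/Clang extended asm, any register the block writes that is not an output operand must be listed as clobbered, or the compiler may keep a live value in it across the statement. A minimal sketch, assuming an SVE-enabled AArch64 toolchain; "cc" and "memory" simply mirror the kernels' style:

// The block writes z0, so z0 must appear in the clobber list.
static inline void zero_z0(void)
{
  __asm__ volatile(
    "mov z0.h, #0\n"   // SVE: splat zero into z0's halfword lanes
    :
    :
    : "cc", "memory", "z0");
}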
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index e6bfea1790..fdbee67926 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -113,14 +113,14 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x11, x23, LSL #1\n"
"add x28, x15, x17\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.h, XZR, %x[n_channels]\n"
"add x27, x9, x23, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x26, x28, x17\n"
"add x25, x27, x23, LSL #1\n"
- "ld1h { z16.h }, p3/Z, [x10]\n"
+ "ld1h { z29.h }, p3/Z, [x10]\n"
"ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
"add x24, x26, x17\n"
"add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
@@ -146,378 +146,378 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z14.h }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
"whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x10]\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
"inch x21\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
"inch x12\n"
- "fmla z30.h, p3/M, z1.h, z8.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z26.h, p3/M, z1.h, z8.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
"addvl x14, x14, #1\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z2.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
"addvl x11, x11, #1\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
"inch x20\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z26.h, p3/M, z3.h, z24.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z7.h\n"
+ "fmla z31.h, p3/M, z18.h, z8.h\n"
"ld1h { z7.h }, p1/Z, [x11]\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z14.h\n"
+ "fmla z30.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z22.h, z8.h\n"
+ "fmla z31.h, p3/M, z22.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z22.h, z0.h\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z13.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
"addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z26.h, p3/M, z20.h, z19.h\n"
+ "fmla z30.h, p3/M, z20.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x27]\n"
- "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z5.h\n"
+ "fmla z30.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z23.h\n"
+ "fmla z31.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z2.h\n"
+ "fmla z30.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z14.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.h, p3/M, z8.h, z0.h\n"
+ "fmla z31.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z24.h\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z31.h, p3/M, z16.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
"addvl x27, x27, #1\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z16.h, z22.h\n"
+ "fmla z30.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z5.h\n"
+ "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z16.h }, p2/Z, [x25]\n"
+ "fmla z26.h, p3/M, z17.h, z0.h\n"
+ "fmla z30.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z2.h\n"
+ "fmla z31.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
"ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "fmla z30.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z25.h\n"
+ "fmla z31.h, p3/M, z23.h, z24.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "fmla z30.h, p3/M, z23.h, z4.h\n"
+ "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z4.h\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z27.h, p3/M, z18.h, z22.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "fmla z30.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z13.h, z19.h\n"
+ "fmla z31.h, p3/M, z13.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
"ld1h { z14.h }, p1/Z, [x9]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z13.h, z8.h\n"
+ "fmla z30.h, p3/M, z13.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z4.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
"ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
+ "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z31.h, p3/M, z23.h, z25.h\n"
"ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z17.h\n"
+ "fmla z30.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
"ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
+ "fmla z27.h, p3/M, z21.h, z25.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
"ld1h { z5.h }, p1/Z, [x14]\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z16.h\n"
+ "fmla z30.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
"ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
"addvl x10, x10, #16\n"
"whilelt p2.h, x21, %x[n_channels]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z20.h, z18.h\n"
+ "fmla z30.h, p3/M, z20.h, z17.h\n"
"cmp x12, %x[n_channels]\n"
"addvl x23, x23, #1\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z26.h, p3/M, z19.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
"ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
"ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
"ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
"ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
"ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
- "st1h { z28.h }, p0, [x13]\n"
- "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
"addvl x13, x13, #1\n"
"ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z26.h }, p0, [x22]\n"
"addvl x10, x10, #-6\n"
- "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x10]\n"
+ "movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10]\n"
"ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
"ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.h, p3/M, z1.h, z8.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z5.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
"add x8, x8, #0x1\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
"cmp x8, x20\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z5.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
"add x21, x12, #0x1\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z5.h, p3/M, z3.h, z22.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
"csel x12, x12, x21, LT\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
"csel x8, x8, XZR, LT\n"
"cmp x12, x20\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z5.h, p3/M, z20.h, z14.h\n"
+ "fmla z29.h, p3/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z19.h, z1.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z13.h\n"
+ "fmla z31.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x27]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
- "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x13]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z17.h, z22.h\n"
+ "fmla z31.h, p3/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x27]\n"
+ "fmla z5.h, p3/M, z17.h, z27.h\n"
+ "fmla z29.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z6.h\n"
+ "fmla z31.h, p3/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z24.h\n"
+ "fmla z29.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z14.h\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z22.h\n"
+ "fmla z29.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z1.h\n"
+ "fmla z31.h, p3/M, z25.h, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z19.h\n"
+ "fmla z29.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z0.h\n"
+ "fmla z31.h, p3/M, z23.h, z27.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z23.h, z18.h\n"
+ "fmla z29.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z27.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z29.h, p3/M, z20.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z24.h\n"
+ "fmla z31.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z11.h\n"
+ "fmla z29.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z22.h\n"
+ "fmla z31.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z0.h\n"
+ "fmla z29.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z10.h, z19.h\n"
+ "fmla z31.h, p3/M, z10.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z10.h, z3.h\n"
+ "fmla z29.h, p3/M, z10.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z6.h, z7.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z6.h, z26.h\n"
+ "fmla z29.h, p3/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z7.h\n"
+ "fmla z31.h, p3/M, z9.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z5.h, p3/M, z9.h, z24.h\n"
+ "fmla z29.h, p3/M, z9.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z11.h\n"
+ "fmla z31.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z27.h\n"
+ "fmla z29.h, p3/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z0.h\n"
+ "fmla z31.h, p3/M, z25.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z31.h, p3/M, z23.h, z26.h\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z26.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z5.h, p3/M, z21.h, z16.h\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z27.h\n"
+ "fmla z5.h, p3/M, z20.h, z18.h\n"
+ "fmla z29.h, p3/M, z20.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z27.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z5.h, p3/M, z19.h, z17.h\n"
+ "fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z5.h, p3/M, z5.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
+ "fmin z5.h, p3/M, z5.h, z28.h\n"
+ "fmin z29.h, p3/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
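
Two recurring changes run through the hunk above. The register renaming in the regenerated assembly spreads the working set across more SVE vector registers, so the asm clobber list grows to cover z15 through z27: any register the assembly writes must be declared clobbered, or the compiler is free to keep a live value there across the statement. Below is a minimal, hypothetical AArch64 inline-asm sketch of that rule — the function name and register choice are illustrative only, not taken from the patch:

#include <cstdint>

// Hypothetical example: x20 is written inside the asm body, so it must
// appear in the clobber list even though it is only used as scratch.
static inline int64_t scale_by_three(int64_t value)
{
    int64_t result;
    __asm__ volatile(
        "mov x20, #3\n"
        "mul %x[res], %x[val], x20\n"
        : [res] "=r" (result)
        : [val] "r" (value)
        : "cc", "x20"  // omit "x20" here and callers may be miscompiled
    );
    return result;
}
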
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 2e20b524d8..1ec0cb2cbf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -104,448 +104,448 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x13, #0x0\n"
"ldp x12, x11, [x20, #0x10]\n"
"whilelt p3.h, XZR, %x[n_channels]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "cnth x28\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cnth x10\n"
"ptrue p2.b\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1h { z5.h }, p3/Z, [x10, x13, LSL #1]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1h { z6.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "sub x24, XZR, x28\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z17.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z16.h }, p2/Z, [x27]\n"
- "ld1h { z0.h }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z2.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ld1h { z3.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ld1h { z7.h }, p3/Z, [x26, x13, LSL #1]\n"
- "addvl x27, x27, #6\n"
- "ld1h { z8.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z13.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ld1h { z14.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "addvl x9, x9, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.h, p2/M, z1.h, z8.h\n"
- "fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.h, p2/M, z2.h, z9.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.h, p2/M, z0.h, z7.h\n"
- "fmla z29.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
- "whilelt p1.h, x28, %x[n_channels]\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
- "inch x24\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
- "ld1h { z16.h }, p2/Z, [x27, #4, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x27]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
- "fmla z31.h, p2/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
- "ld1h { z5.h }, p1/Z, [x10, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z26.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.h, p2/M, z2.h, z9.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
- "ld1h { z6.h }, p1/Z, [x9, x28, LSL #1]\n"
- "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z27.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z26.h, p2/M, z2.h, z5.h\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "inch x13\n"
- "ld1h { z7.h }, p1/Z, [x26, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.h, p2/M, z3.h, z5.h\n"
+ "fmla z26.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- "ld1h { z8.h }, p1/Z, [x25, x28, LSL #1]\n"
- "ld1h { z9.h }, p1/Z, [x23, x28, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x22, x28, LSL #1]\n"
- "ld1h { z11.h }, p1/Z, [x21, x28, LSL #1]\n"
- "fmax z28.h, p2/M, z28.h, z18.h\n"
- "fmax z29.h, p2/M, z29.h, z18.h\n"
- "ld1h { z12.h }, p1/Z, [x20, x28, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x10, x28, LSL #1]\n"
- "fmax z30.h, p2/M, z30.h, z18.h\n"
- "fmax z31.h, p2/M, z31.h, z18.h\n"
- "ld1h { z14.h }, p1/Z, [x9, x28, LSL #1]\n"
+ "fmla z27.h, p2/M, z4.h, z20.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "fmla z26.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z21.h, z7.h\n"
+ "fmla z27.h, p2/M, z21.h, z8.h\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.h, p2/M, z21.h, z14.h\n"
+ "fmla z26.h, p2/M, z21.h, z11.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z18.h, z8.h\n"
+ "fmla z27.h, p2/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z17.h, z13.h\n"
+ "fmla z27.h, p2/M, z17.h, z5.h\n"
+ "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.h, p2/M, z17.h, z0.h\n"
+ "fmla z26.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z27.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z16.h, z29.h\n"
+ "fmla z26.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z19.h, z22.h\n"
+ "fmla z27.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z3.h\n"
+ "fmla z26.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z25.h, z14.h\n"
+ "fmla z27.h, p2/M, z25.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.h, p2/M, z25.h, z6.h\n"
+ "fmla z26.h, p2/M, z25.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z18.h, z11.h\n"
+ "fmla z27.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.h, p2/M, z18.h, z23.h\n"
+ "fmla z26.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.h, x10, %x[n_channels]\n"
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z27.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "fmla z26.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
"inch x28\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
+ "fmla z30.h, p2/M, z16.h, z29.h\n"
+ "fmla z27.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.h, p2/M, z16.h, z7.h\n"
+ "fmla z26.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z21.h, z3.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "fmla z26.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z20.h, z6.h\n"
+ "fmla z27.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z26.h, p2/M, z20.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z27.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z27.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "fmla z26.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z7.h\n"
+ "fmla z27.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "fmla z26.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z10.h, z19.h\n"
+ "fmla z27.h, p2/M, z10.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z10.h, z13.h\n"
+ "fmla z26.h, p2/M, z10.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z8.h, z0.h\n"
+ "fmla z27.h, p2/M, z8.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z8.h, z18.h\n"
+ "fmla z26.h, p2/M, z8.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.h, p2/M, z23.h, z11.h\n"
+ "fmla z27.h, p2/M, z23.h, z25.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.h, p2/M, z23.h, z17.h\n"
+ "fmla z26.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z16.h\n"
+ "fmla z26.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z27.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.h, p2/M, z20.h, z18.h\n"
+ "fmla z26.h, p2/M, z20.h, z17.h\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z27.h, p2/M, z19.h, z22.h\n"
+ "inch x13\n"
+ "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z17.h\n"
+ "fmla z26.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z27.h, p2/M, z27.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmax z26.h, p2/M, z26.h, z15.h\n"
+ "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "inch x10\n"
+ "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"whilelt p3.h, x13, %x[n_channels]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "fmin z28.h, p2/M, z28.h, z17.h\n"
- "fmin z29.h, p2/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
- "fmin z30.h, p2/M, z30.h, z17.h\n"
- "fmin z31.h, p2/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
- "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
- "addvl x27, x27, #-6\n"
- "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z27.h, p2/M, z27.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "fmin z26.h, p2/M, z26.h, z28.h\n"
+ "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
+ "addvl x9, x9, #-6\n"
+ "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.h, p2/M, z1.h, z8.h\n"
- "fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.h, p2/M, z2.h, z9.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
"fmla z29.h, p2/M, z0.h, z8.h\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
- "inch x24\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x27]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
"fmla z31.h, p2/M, z1.h, z9.h\n"
- "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.h, p2/M, z2.h, z9.h\n"
"fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z22.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.h, p2/M, z3.h, z11.h\n"
"fmla z31.h, p2/M, z3.h, z12.h\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "fmax z28.h, p2/M, z28.h, z18.h\n"
- "fmax z29.h, p2/M, z29.h, z18.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.h, p2/M, z3.h, z22.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- "fmax z30.h, p2/M, z30.h, z18.h\n"
- "fmax z31.h, p2/M, z31.h, z18.h\n"
- "fmin z28.h, p2/M, z28.h, z17.h\n"
- "fmin z29.h, p2/M, z29.h, z17.h\n"
- "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
- "fmin z30.h, p2/M, z30.h, z17.h\n"
- "fmin z31.h, p2/M, z31.h, z17.h\n"
- "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
- "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
- "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z8.h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z19.h, z8.h\n"
+ "fmla z31.h, p2/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.h, p2/M, z19.h, z1.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z18.h, z13.h\n"
+ "fmla z31.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z31.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.h, p2/M, z17.h, z27.h\n"
+ "fmla z29.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "fmla z31.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.h, p2/M, z21.h, z22.h\n"
+ "fmla z29.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z25.h, z1.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z25.h, z19.h\n"
+ "fmla z29.h, p2/M, z25.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z27.h\n"
+ "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.h, p2/M, z23.h, z18.h\n"
+ "fmla z29.h, p2/M, z23.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z20.h, z27.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.h, p2/M, z20.h, z9.h\n"
+ "fmla z29.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z24.h\n"
+ "fmla z31.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z8.h\n"
+ "fmla z29.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z21.h, z22.h\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.h, p2/M, z21.h, z10.h\n"
+ "fmla z29.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.h, p2/M, z4.h, z19.h\n"
+ "fmla z31.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.h, p2/M, z4.h, z0.h\n"
+ "fmla z29.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z6.h, z18.h\n"
+ "fmla z31.h, p2/M, z6.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z6.h, z26.h\n"
+ "fmla z29.h, p2/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z11.h, z9.h\n"
+ "fmla z31.h, p2/M, z11.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z11.h, z24.h\n"
+ "fmla z29.h, p2/M, z11.h, z27.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z8.h\n"
+ "fmla z31.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z27.h\n"
+ "fmla z29.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z25.h, z10.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z25.h, z18.h\n"
+ "fmla z29.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z26.h\n"
+ "fmla z5.h, p2/M, z23.h, z17.h\n"
+ "fmla z29.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z21.h, z26.h\n"
+ "fmla z31.h, p2/M, z21.h, z24.h\n"
+ "fmla z5.h, p2/M, z21.h, z16.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z27.h\n"
+ "fmla z5.h, p2/M, z20.h, z18.h\n"
+ "fmla z29.h, p2/M, z20.h, z17.h\n"
+ "fmla z30.h, p2/M, z19.h, z27.h\n"
+ "fmla z31.h, p2/M, z19.h, z22.h\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmla z5.h, p2/M, z19.h, z17.h\n"
+ "fmla z29.h, p2/M, z19.h, z16.h\n"
+ "fmax z5.h, p2/M, z5.h, z15.h\n"
+ "fmax z29.h, p2/M, z29.h, z15.h\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z5.h, p2/M, z5.h, z28.h\n"
+ "fmin z29.h, p2/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index d7b1de2062..16b96fdb8e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index a570c5aa6a..1bdef85274 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -108,10 +108,10 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"whilelt p2.s, XZR, %x[n_channels]\n"
"madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1w { z18.s }, p3/Z, [x10]\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
"add x27, x13, x13\n"
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
@@ -125,10 +125,10 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x11, %x[n_channels]\n"
"add x23, x25, x23, LSL #2\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
"add x22, x28, x22, LSL #2\n"
"mov x21, #0x0\n"
@@ -142,175 +142,175 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
"whilelt p1.s, x11, %x[n_channels]\n"
"incw x21\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
"incw x11\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
"mov p0.b, p2.b\n"
- "ld1w { z18.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
"addvl x9, x9, #1\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
"addvl x26, x26, #1\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
"ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
"ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
"ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
"cmp x11, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
"ld1w { z10.s }, p1/Z, [x9]\n"
"ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
"ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
- "st1w { z28.s }, p0, [x28]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
"addvl x23, x23, #1\n"
- "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z22.s }, p0, [x22]\n"
"addvl x28, x28, #1\n"
"ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
"addvl x10, x10, #-6\n"
- "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
"ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
"add x21, x10, #0x1\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
"csel x10, x10, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
"csel x14, x14, XZR, LT\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
"cmp x10, x20\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p0, [x28]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 903de0d309..873b4736ff 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -87,7 +87,7 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
@@ -98,99 +98,99 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
"addvl x16, x16, #-6\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"whilelt p1.s, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ldr x23, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
"incw x28\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
"mov p0.b, p2.b\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
"incw x9\n"
- "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
"incw x14\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
@@ -199,98 +199,98 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"cmp x14, %x[n_channels]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x21, [x15, #0x30]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x38]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
"ldr x23, [x15, #0x60]\n"
"ldr x22, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
"ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
"incw x28\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z31.s, p3/M, z7.s, z12.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 41ad193364..e4f432c9ed 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 3;
sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index cda34358f5..015d0e63c2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x10, x23, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
@@ -129,10 +129,10 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
"add x24, x11, x21, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"cmp x15, %x[n_channels]\n"
"add x23, x24, x21, LSL #2\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
"add x22, x16, x16\n"
"mov x21, #0x0\n"
@@ -146,131 +146,131 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x15, %x[n_channels]\n"
"incw x21\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
"incw x15\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
"incw x20\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
"addvl x10, x10, #1\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
"addvl x28, x28, #1\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
"addvl x14, x14, #1\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
"ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x14]\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
"ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
"addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
"ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
"addvl x26, x26, #1\n"
"ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
@@ -279,182 +279,182 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"cmp x15, %x[n_channels]\n"
"ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
"addvl x13, x13, #16\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
"ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
"ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x26]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
"ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
- "st1w { z23.s }, p0, [x11]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
"addvl x11, x11, #1\n"
"ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
"st1w { z26.s }, p0, [x24]\n"
"addvl x13, x13, #-6\n"
- "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x8, x8, #0x1\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
"cmp x8, x20\n"
"add x21, x13, #0x1\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
"csel x13, x13, x21, LT\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
"csel x8, x8, XZR, LT\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
"cmp x13, x20\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p0, [x11]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
"st1w { z26.s }, p0, [x24]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z29.s }, p0, [x23]\n"
- "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
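(Note on the clobber-list hunk just above: the rewritten loop now scratches z14 and z19-z22 as well, so those registers are added to the asm statement's clobber list. A minimal, self-contained sketch of why the list must track every register the assembly writes; the function below is illustrative only and is not part of this patch:)

#include <cstdint>

int64_t add_via_asm(int64_t a, int64_t b)
{
#if defined(__aarch64__)
    int64_t result;
    __asm__ __volatile__(
        "mov x9, %x[a]\n"           // x9 is scratched here...
        "add %x[res], x9, %x[b]\n"
        : [res] "=r" (result)
        : [a] "r" (a), [b] "r" (b)
        : "x9"                      // ...so it must be declared as clobbered,
    );                              // or the compiler may keep a live value in it
    return result;
#else
    return a + b;                   // portable fallback for other targets
#endif
}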
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 2eed8cb0c4..4809b0c45c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -88,390 +88,390 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "cntw x15\n"
- "mov x14, #0x0\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [x16, #0x20]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
- "addvl x17, x17, #-6\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "incw x13\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
"mov p1.b, p2.b\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x23, [x27, #0x0]\n"
- "whilelt p0.s, x15, %x[n_channels]\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x22, [x27, #0x8]\n"
- "ldr x21, [x27, #0x10]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x27, #0x18]\n"
- "ld1w { z18.s }, p3/Z, [x17]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x28, [x16, #0x20]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ldr x23, [x27, #0x20]\n"
- "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "incw x14\n"
- "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
- "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "ld1w { z13.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
"incw x15\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "whilelt p2.s, x14, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
+ "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x26, [x16, #0x30]\n"
- "ldr x25, [x16, #0x38]\n"
- "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x16, #0x28]\n"
- "ldr x11, [x16, #0x48]\n"
- "fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x16, #0x60]\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x26, [x16, #0x70]\n"
- "ldr x11, [x16, #0x88]\n"
- "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "incw x13\n"
- "mov p1.b, p2.b\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p0.b, p2.b\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x23, [x27, #0x0]\n"
- "ldr x22, [x27, #0x8]\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x24, [x16, #0x68]\n"
- "ldr x25, [x16, #0x78]\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x21, [x27, #0x10]\n"
- "ldr x20, [x27, #0x18]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x28, [x16, #0xa0]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z12.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z3.s, z10.s\n"
- "ldr x26, [x16, #0xb0]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z27.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmax z24.s, p3/M, z24.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z17.s\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z5.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ldr x23, [x27, #0x20]\n"
- "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmax z25.s, p3/M, z25.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmax z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z17.s\n"
- "fmax z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x22, [x27, #0x28]\n"
- "fmax z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x21, [x27, #0x30]\n"
- "ldr x20, [x27, #0x38]\n"
- "ldr x23, [x27, #0x40]\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
- "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
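(The guard simplification at the top and bottom of each kernel file drops the explicit __aarch64__ test and keys solely on ARM_COMPUTE_ENABLE_SVE; a plausible reading is that the build system only defines that macro when targeting AArch64, making the separate architecture check redundant. A hedged sketch of the resulting pattern, with a placeholder body:)

#if defined(ARM_COMPUTE_ENABLE_SVE)

namespace arm_conv {
namespace depthwise {

// Real kernels place their SVE inline assembly here; this stub only
// demonstrates that the whole translation unit compiles to nothing
// when the macro is undefined, e.g. on non-SVE builds.
void example_sve_kernel_stub() {}  // hypothetical name, not from the patch

} // namespace depthwise
} // namespace arm_conv

#endif // defined(ARM_COMPUTE_ENABLE_SVE)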
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 6073b2ba7d..38b377509e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 4;
sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
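(The header hunk above also replaces the hard-coded constructor arguments (4, 3, 1) with the strategy's named shape constants forwarded through Parent(...). A minimal standalone sketch of that pattern, with a hypothetical base class standing in for DepthwiseDepthfirstStrategy:)

struct ParentStrategy
{
    ParentStrategy(unsigned out_rows, unsigned out_cols,
                   unsigned kern_rows, unsigned kern_cols,
                   unsigned str_rows, unsigned str_cols)
    { /* store the tile geometry */ }
};

struct ChildStrategy : ParentStrategy
{
    // Named constants keep the kernel's geometry in one place...
    constexpr static unsigned output_rows = 4, output_cols = 4;
    constexpr static unsigned kernel_rows = 3, kernel_cols = 3;
    constexpr static unsigned stride_rows = 1, stride_cols = 1;

    // ...so the base-class call can no longer drift out of sync with
    // them, as literal arguments like (4, 3, 1) could.
    ChildStrategy()
    : ParentStrategy(output_rows, output_cols,
                     kernel_rows, kernel_cols,
                     stride_rows, stride_cols)
    {}
};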
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index cdf77a1cf0..35445595f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,7 +113,7 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mul x21, x21, x25\n" // offset *= kernel_stride * output_size
"add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
"add x13, x8, x23, LSL #2\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"add x12, x13, x23, LSL #2\n"
"add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -132,8 +132,8 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"add x27, x10, x5\n"
"add x26, x9, x22, LSL #2\n"
"add x25, x6, x6\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"cmp x16, %x[n_channels]\n"
"add x24, x28, x23, LSL #2\n"
"ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
@@ -149,500 +149,500 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
"whilelt p1.s, x16, %x[n_channels]\n"
"incw x21\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
+ "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
"incw x16\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
"incw x20\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
"ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z21.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
"ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
"ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x28]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z14.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z13.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z13.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z13.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z27.s, p3/M, z3.s, z9.s\n"
+ "fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z25.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
"fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
- "addvl x8, x8, #1\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
- "addvl x12, x12, #1\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
"addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
"fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
"fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
"addvl x24, x24, #1\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
"addvl x13, x13, #1\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z0.s\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmla z29.s, p3/M, z5.s, z0.s\n"
+ "fmla z24.s, p3/M, z4.s, z0.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
"ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
"ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
"ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
"ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
"ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
"addvl x17, x17, #16\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
"ld1w { z10.s }, p1/Z, [x8]\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
"ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x15]\n"
+ "fmin z21.s, p3/M, z21.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
"ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z22.s, p3/M, z22.s, z16.s\n"
+ "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
"addvl x28, x28, #1\n"
- "st1w { z20.s }, p0, [x9]\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
"addvl x15, x15, #1\n"
- "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
"addvl x17, x17, #-6\n"
- "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
- "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
+ "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
"addvl x9, x9, #1\n"
- "st1w { z24.s }, p0, [x26]\n"
- "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
"addvl x26, x26, #1\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"add x4, x4, #0x1\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
"cmp x4, x20\n"
"add x21, x16, #0x1\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"csel x16, x16, x21, LT\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
"mov p0.b, p2.b\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24]\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
"csel x4, x4, XZR, LT\n"
"cmp x16, x20\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "st1w { z16.s }, p0, [x15]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "st1w { z20.s }, p0, [x9]\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z24.s }, p0, [x26]\n"
- "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z10.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z22.s\n"
+ "fmla z18.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z13.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z25.s, p3/M, z3.s, z29.s\n"
+ "fmla z12.s, p3/M, z2.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z24.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z3.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z22.s\n"
+ "fmla z11.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z21.s\n"
+ "fmla z14.s, p3/M, z2.s, z29.s\n"
+ "fmla z31.s, p3/M, z5.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z29.s\n"
+ "fmla z27.s, p3/M, z3.s, z29.s\n"
+ "fmla z30.s, p3/M, z1.s, z29.s\n"
+ "fmla z11.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z19.s\n"
+ "fmla z12.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z22.s\n"
+ "fmla z14.s, p3/M, z3.s, z22.s\n"
+ "fmla z26.s, p3/M, z1.s, z22.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z19.s\n"
+ "fmla z14.s, p3/M, z6.s, z19.s\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "fmla z13.s, p3/M, z3.s, z19.s\n"
+ "fmla z10.s, p3/M, z1.s, z19.s\n"
+ "fmla z12.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z29.s\n"
+ "fmla z18.s, p3/M, z1.s, z29.s\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x12]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z0.s, z22.s\n"
+ "fmla z17.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11]\n"
+ "fmla z12.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z6.s, z29.s\n"
+ "fmla z26.s, p3/M, z3.s, z29.s\n"
+ "fmla z10.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z9.s\n"
+ "fmla z12.s, p3/M, z7.s, z22.s\n"
+ "fmla z23.s, p3/M, z6.s, z22.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z13.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
+ "fmla z12.s, p3/M, z5.s, z21.s\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z11.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z25.s, p3/M, z6.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z18.s, p3/M, z3.s, z22.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z17.s, p3/M, z1.s, z22.s\n"
+ "fmla z14.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z4.s, z29.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z30.s, p3/M, z2.s, z29.s\n"
+ "fmla z11.s, p3/M, z1.s, z29.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmax z11.s, p3/M, z11.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmla z10.s, p3/M, z4.s, z21.s\n"
+ "fmla z12.s, p3/M, z3.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmax z10.s, p3/M, z10.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmax z12.s, p3/M, z12.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z11.s, p3/M, z11.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z17.s }, p0, [x9]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z12.s, p3/M, z12.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z10.s }, p0, [x23]\n"
+ "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
@@ -653,4 +653,4 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 0b04ae064d..3db248924f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -99,616 +99,616 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "cntw x15\n"
- "mov x14, #0x0\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "cntw x17\n"
+ "mov x16, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "sub x13, XZR, x15\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "incw x13\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
+ "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
"mov p1.b, p2.b\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "whilelt p0.s, x15, %x[n_channels]\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.s, p3/M, z7.s, z25.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z25.s\n"
+ "fmla z31.s, p3/M, z3.s, z25.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
+ "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
+ "whilelt p0.s, x17, %x[n_channels]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "fmla z14.s, p3/M, z8.s, z25.s\n"
+ "fmla z23.s, p3/M, z5.s, z25.s\n"
+ "fmla z15.s, p3/M, z2.s, z25.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z9.s, p3/M, z1.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z13.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
"fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x9, [x16, #0x118]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldp x12, x11, [x16, #0x0]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.s, p3/M, z3.s, z25.s\n"
+ "fmla z14.s, p3/M, z0.s, z25.s\n"
+ "fmla z23.s, p3/M, z6.s, z29.s\n"
+ "fmla z15.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
"fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
- "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z5.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z9.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.s, p3/M, z7.s, z25.s\n"
+ "fmla z18.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.s, p3/M, z7.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z30.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.s, p3/M, z8.s, z10.s\n"
+ "fmla z21.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z5.s, z10.s\n"
+ "fmla z11.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z27.s, p3/M, z0.s, z29.s\n"
+ "fmla z14.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.s, p3/M, z6.s, z25.s\n"
+ "fmla z23.s, p3/M, z4.s, z25.s\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z15.s, p3/M, z1.s, z25.s\n"
+ "fmla z18.s, p3/M, z0.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.s, p3/M, z2.s, z25.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z0.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z5.s, z25.s\n"
+ "fmla z28.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z25.s\n"
+ "fmla z21.s, p3/M, z3.s, z25.s\n"
+ "fmla z9.s, p3/M, z8.s, z12.s\n"
+ "fmla z11.s, p3/M, z5.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z6.s, z25.s\n"
+ "fmla z15.s, p3/M, z5.s, z25.s\n"
+ "fmla z13.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z29.s\n"
+ "fmla z21.s, p3/M, z6.s, z29.s\n"
+ "fmla z23.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z25.s\n"
+ "fmla z31.s, p3/M, z7.s, z25.s\n"
+ "fmla z13.s, p3/M, z6.s, z25.s\n"
+ "fmla z18.s, p3/M, z5.s, z25.s\n"
+ "fmla z21.s, p3/M, z4.s, z25.s\n"
+ "fmla z28.s, p3/M, z3.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z27.s, p3/M, z5.s, z25.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z25.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z20.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z25.s\n"
+ "fmla z11.s, p3/M, z1.s, z25.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z12.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
+ "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.s, p3/M, z4.s, z10.s\n"
+ "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla z28.s, p3/M, z4.s, z12.s\n"
- "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldp x10, x9, [x16, #0x10]\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x58]\n"
- "incw x14\n"
- "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
- "incw x15\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x70]\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x78]\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "whilelt p2.s, x14, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "cmp x15, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "addvl x17, x17, #-6\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.s, p3/M, z23.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "incw x16\n"
+ "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z19.s\n"
+ "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.s, p3/M, z21.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
+ "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
- "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x27, [x16, #0x20]\n"
- "ldr x26, [x16, #0x30]\n"
- "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x16, #0x28]\n"
- "ldr x24, [x16, #0x38]\n"
- "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "ldr x12, [x16, #0x40]\n"
- "ldr x11, [x16, #0x48]\n"
- "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x16, #0x50]\n"
- "ldr x9, [x16, #0x58]\n"
- "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
- "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ldr x27, [x16, #0x60]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "incw x13\n"
- "mov p1.b, p2.b\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x88]\n"
- "fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x23, [x28, #0x0]\n"
- "ldr x22, [x28, #0x8]\n"
- "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0x78]\n"
- "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0x80]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z12.s\n"
- "fmla z17.s, p3/M, z0.s, z12.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x90]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z23.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0x98]\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "ldr x12, [x16, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z1.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
- "ldr x9, [x16, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z21.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z23.s, p3/M, z4.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
"fmla z18.s, p3/M, z0.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x12, [x16, #0x100]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.s, p3/M, z7.s, z23.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
+ "fmla z15.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z23.s\n"
+ "fmla z20.s, p3/M, z3.s, z23.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
+ "fmla z27.s, p3/M, z8.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "fmla z28.s, p3/M, z2.s, z23.s\n"
"fmla z18.s, p3/M, z1.s, z12.s\n"
- "fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
- "ldr x11, [x16, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
- "ldr x10, [x16, #0x110]\n"
- "fmla z23.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x9, [x16, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z8.s, z12.s\n"
- "fmla z23.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z11.s\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z27.s, p3/M, z5.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z16.s, p3/M, z16.s, z14.s\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmax z17.s, p3/M, z17.s, z14.s\n"
- "fmax z18.s, p3/M, z18.s, z14.s\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmax z19.s, p3/M, z19.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z13.s\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
- "fmin z17.s, p3/M, z17.s, z13.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
- "fmin z18.s, p3/M, z18.s, z13.s\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z11.s\n"
- "fmin z19.s, p3/M, z19.s, z13.s\n"
- "fmax z20.s, p3/M, z20.s, z14.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmax z21.s, p3/M, z21.s, z14.s\n"
- "fmax z22.s, p3/M, z22.s, z14.s\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmax z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
- "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x23, [x28, #0x20]\n"
- "ldr x22, [x28, #0x28]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x30]\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x38]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmin z20.s, p3/M, z20.s, z13.s\n"
- "fmin z21.s, p3/M, z21.s, z13.s\n"
- "fmin z22.s, p3/M, z22.s, z13.s\n"
- "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x40]\n"
- "fmin z23.s, p3/M, z23.s, z13.s\n"
- "fmax z24.s, p3/M, z24.s, z14.s\n"
- "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x48]\n"
- "fmax z25.s, p3/M, z25.s, z14.s\n"
- "fmax z26.s, p3/M, z26.s, z14.s\n"
- "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x50]\n"
- "fmax z27.s, p3/M, z27.s, z14.s\n"
- "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x58]\n"
- "fmin z24.s, p3/M, z24.s, z13.s\n"
- "fmin z25.s, p3/M, z25.s, z13.s\n"
- "fmin z26.s, p3/M, z26.s, z13.s\n"
- "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
- "ldr x23, [x28, #0x60]\n"
- "fmin z27.s, p3/M, z27.s, z13.s\n"
- "fmax z28.s, p3/M, z28.s, z14.s\n"
- "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
- "ldr x22, [x28, #0x68]\n"
- "fmax z29.s, p3/M, z29.s, z14.s\n"
- "fmax z30.s, p3/M, z30.s, z14.s\n"
- "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
- "ldr x21, [x28, #0x70]\n"
- "fmax z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x28, #0x78]\n"
- "fmin z28.s, p3/M, z28.s, z13.s\n"
- "fmin z29.s, p3/M, z29.s, z13.s\n"
- "fmin z30.s, p3/M, z30.s, z13.s\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z13.s\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "fmla z9.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z30.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z10.s, p3/M, z3.s, z29.s\n"
+ "fmla z25.s, p3/M, z2.s, z29.s\n"
+ "fmla z24.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.s, p3/M, z4.s, z29.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.s, p3/M, z8.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "fmla z15.s, p3/M, z1.s, z17.s\n"
+ "fmla z11.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z4.s, z21.s\n"
+ "fmla z14.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z1.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z23.s\n"
+ "fmla z26.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z29.s\n"
+ "fmla z15.s, p3/M, z5.s, z29.s\n"
+ "fmla z11.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z22.s, p3/M, z0.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.s, p3/M, z6.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z23.s\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z13.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "fmla z15.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z9.s, p3/M, z8.s, z23.s\n"
+ "fmla z11.s, p3/M, z5.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z5.s, z21.s\n"
+ "fmla z10.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z10.s, p3/M, z6.s, z21.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "fmla z24.s, p3/M, z4.s, z21.s\n"
+ "fmla z26.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z11.s, p3/M, z8.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z29.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z29.s\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z23.s\n"
+ "fmla z30.s, p3/M, z6.s, z23.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmla z20.s, p3/M, z8.s, z29.s\n"
+ "fmla z10.s, p3/M, z7.s, z29.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
+ "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z23.s\n"
+ "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.s, p3/M, z3.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z29.s\n"
+ "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.s, p3/M, z10.s, z19.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.s, p3/M, z25.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z25.s, p3/M, z25.s, z19.s\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 17ac74e223..75d62007ab 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 5a1f309b88..e6090fda94 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -112,7 +112,7 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x28, x12, x23, LSL #2\n"
"madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x11]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
"ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
@@ -128,8 +128,8 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"add x24, x26, x15\n"
"add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"cmp x13, %x[n_channels]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x23, x25, x23, LSL #2\n"
"add x22, x9, x21, LSL #2\n"
"ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
@@ -147,191 +147,191 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
"whilelt p1.s, x13, %x[n_channels]\n"
"incw x21\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
"incw x13\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z14.s }, p2/Z, [x25]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p2/Z, [x27]\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
"addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
- "ld1w { z19.s }, p3/Z, [x11]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
"ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
"addvl x27, x27, #1\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
"ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
"ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
"ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
"ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
"ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
"cmp x13, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
"ld1w { z10.s }, p1/Z, [x12]\n"
"ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
"ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
"addvl x25, x25, #1\n"
"ld1w { z14.s }, p1/Z, [x28]\n"
"ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
"addvl x23, x23, #1\n"
"ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
- "st1w { z28.s }, p0, [x9]\n"
+ "st1w { z27.s }, p0, [x9]\n"
"ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
"addvl x9, x9, #1\n"
"ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
"addvl x11, x11, #-6\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
"ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z14.s }, p2/Z, [x25]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
"add x16, x16, #0x1\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z15.s }, p2/Z, [x27]\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
"cmp x16, x20\n"
"add x21, x11, #0x1\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
"csel x11, x11, x21, LT\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
"csel x16, x16, XZR, LT\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
"ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
"cmp x11, x20\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z28.s }, p0, [x9]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index eb6c2daa97..98427701fa 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -96,7 +96,7 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x11, x10, [x20, #0x10]\n"
"mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z19.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
@@ -111,8 +111,8 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
@@ -126,89 +126,89 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
"whilelt p1.s, x14, %x[n_channels]\n"
"ldp x27, x26, [x15, #0x0]\n"
"ldp x25, x24, [x15, #0x10]\n"
"ldp x23, x22, [x15, #0x20]\n"
"incw x9\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
"ldp x21, x20, [x15, #0x30]\n"
"ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
"ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
"ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
"incw x28\n"
"ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
@@ -216,122 +216,122 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"whilelt p2.s, x9, %x[n_channels]\n"
"ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
"ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
"ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "ld1w { z19.s }, p3/Z, [x16]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
"cmp x14, %x[n_channels]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
"ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
"ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
"addvl x16, x16, #-6\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "ldr x26, [x15, #0x48]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ldr x24, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x23, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x26, [x15, #0x88]\n"
- "fmla z30.s, p3/M, z3.s, z14.s\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
"ldr x21, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z15.s\n"
- "ldr x25, [x15, #0x90]\n"
- "ldr x22, [x15, #0xa8]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x23, [x15, #0xa0]\n"
- "ldr x21, [x15, #0xb0]\n"
- "fmla z30.s, p3/M, z6.s, z15.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "ldr x27, [x15, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z15.s\n"
- "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
"incw x28\n"
"mov p0.b, p2.b\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 2449c96637..ae89a64c6b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
-void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
@@ -57,7 +57,7 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
constexpr static unsigned int output_cols = 2;
sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
- : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
@@ -68,4 +68,4 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index b4cf6c8582..075181a488 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -113,14 +113,14 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
"add x9, x11, x23, LSL #2\n"
"add x28, x15, x17\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.s, XZR, %x[n_channels]\n"
"add x27, x9, x23, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"add x26, x28, x17\n"
"add x25, x27, x23, LSL #2\n"
- "ld1w { z16.s }, p3/Z, [x10]\n"
+ "ld1w { z29.s }, p3/Z, [x10]\n"
"ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
"add x24, x26, x17\n"
"add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
@@ -146,378 +146,378 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z14.s }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
"whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x10]\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
"incw x21\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
"incw x12\n"
- "fmla z30.s, p3/M, z1.s, z8.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z26.s, p3/M, z1.s, z8.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
"addvl x14, x14, #1\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
"addvl x11, x11, #1\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
"incw x20\n"
- "fmla z30.s, p3/M, z3.s, z5.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z6.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z7.s\n"
- "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "fmla z26.s, p3/M, z3.s, z24.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z7.s\n"
+ "fmla z31.s, p3/M, z18.s, z8.s\n"
"ld1w { z7.s }, p1/Z, [x11]\n"
- "fmla z30.s, p3/M, z0.s, z14.s\n"
- "fmla z31.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z8.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z14.s\n"
+ "fmla z30.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z22.s, z8.s\n"
+ "fmla z31.s, p3/M, z22.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z22.s, z0.s\n"
+ "fmla z30.s, p3/M, z22.s, z19.s\n"
+ "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z13.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
"addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z26.s, p3/M, z20.s, z19.s\n"
+ "fmla z30.s, p3/M, z20.s, z5.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.s, p3/M, z3.s, z5.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x27]\n"
- "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z6.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z5.s\n"
- "fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z6.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z23.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z5.s\n"
+ "fmla z30.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z23.s\n"
+ "fmla z31.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z2.s\n"
+ "fmla z30.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z14.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.s, p3/M, z8.s, z0.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z24.s\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z31.s, p3/M, z16.s, z5.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
"addvl x27, x27, #1\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z16.s, z22.s\n"
+ "fmla z30.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z5.s\n"
+ "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "fmla z26.s, p3/M, z17.s, z0.s\n"
+ "fmla z30.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z2.s\n"
+ "fmla z31.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
"ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z5.s\n"
- "fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z31.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "fmla z30.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z25.s\n"
+ "fmla z31.s, p3/M, z23.s, z24.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "fmla z30.s, p3/M, z23.s, z4.s\n"
+ "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z4.s\n"
+ "fmla z30.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z27.s, p3/M, z18.s, z22.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z31.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z31.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "fmla z30.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z13.s, z19.s\n"
+ "fmla z31.s, p3/M, z13.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
"ld1w { z14.s }, p1/Z, [x9]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z13.s, z8.s\n"
+ "fmla z30.s, p3/M, z13.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
"ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z5.s\n"
+ "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z31.s, p3/M, z23.s, z25.s\n"
"ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z17.s\n"
+ "fmla z30.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
"ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z6.s\n"
+ "fmla z27.s, p3/M, z21.s, z25.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
"ld1w { z5.s }, p1/Z, [x14]\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z16.s\n"
+ "fmla z30.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
"ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z8.s\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
"addvl x10, x10, #16\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z20.s, z18.s\n"
+ "fmla z30.s, p3/M, z20.s, z17.s\n"
"cmp x12, %x[n_channels]\n"
"addvl x23, x23, #1\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z9.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z26.s, p3/M, z19.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
"ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
"ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
"ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
"ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
"ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
- "st1w { z28.s }, p0, [x13]\n"
- "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
"addvl x13, x13, #1\n"
"ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z26.s }, p0, [x22]\n"
"addvl x10, x10, #-6\n"
- "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
"ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x10]\n"
+ "movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10]\n"
"ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
"ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z30.s, p3/M, z1.s, z8.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z5.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
"add x8, x8, #0x1\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
"cmp x8, x20\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z5.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
"add x21, x12, #0x1\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z30.s, p3/M, z3.s, z5.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z5.s, p3/M, z3.s, z22.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
"csel x12, x12, x21, LT\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z6.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z0.s, z7.s\n"
- "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "fmla z30.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
"csel x8, x8, XZR, LT\n"
"cmp x12, x20\n"
- "fmla z30.s, p3/M, z0.s, z14.s\n"
- "fmla z31.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z8.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z5.s, p3/M, z20.s, z14.s\n"
+ "fmla z29.s, p3/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z19.s, z1.s\n"
+ "fmla z29.s, p3/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z13.s\n"
+ "fmla z31.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z18.s, z0.s\n"
+ "fmla z29.s, p3/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z28.s, p3/M, z3.s, z5.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x27]\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z6.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z5.s\n"
- "fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z6.s\n"
- "fmla z31.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z9.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
- "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z5.s\n"
- "fmla z29.s, p3/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z31.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z28.s, p3/M, z1.s, z6.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z31.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z31.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z5.s\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z6.s\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z8.s\n"
- "fmla z30.s, p3/M, z3.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z9.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x13]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z17.s, z22.s\n"
+ "fmla z31.s, p3/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x27]\n"
+ "fmla z5.s, p3/M, z17.s, z27.s\n"
+ "fmla z29.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z6.s\n"
+ "fmla z31.s, p3/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z24.s\n"
+ "fmla z29.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z14.s\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z22.s\n"
+ "fmla z29.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z1.s\n"
+ "fmla z31.s, p3/M, z25.s, z0.s\n"
+ "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z19.s\n"
+ "fmla z29.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z0.s\n"
+ "fmla z31.s, p3/M, z23.s, z27.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z23.s, z18.s\n"
+ "fmla z29.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z27.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p2/Z, [x25]\n"
+ "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z29.s, p3/M, z20.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z24.s\n"
+ "fmla z31.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z11.s\n"
+ "fmla z29.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z22.s\n"
+ "fmla z31.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z0.s\n"
+ "fmla z29.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z10.s, z19.s\n"
+ "fmla z31.s, p3/M, z10.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z10.s, z3.s\n"
+ "fmla z29.s, p3/M, z10.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z6.s, z7.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z6.s, z26.s\n"
+ "fmla z29.s, p3/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z7.s\n"
+ "fmla z31.s, p3/M, z9.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z5.s, p3/M, z9.s, z24.s\n"
+ "fmla z29.s, p3/M, z9.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z11.s\n"
+ "fmla z31.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z27.s\n"
+ "fmla z29.s, p3/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z0.s\n"
+ "fmla z31.s, p3/M, z25.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z31.s, p3/M, z23.s, z26.s\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z21.s, z26.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z5.s, p3/M, z21.s, z16.s\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z27.s\n"
+ "fmla z5.s, p3/M, z20.s, z18.s\n"
+ "fmla z29.s, p3/M, z20.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z27.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z5.s, p3/M, z19.s, z17.s\n"
+ "fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z5.s, p3/M, z5.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
+ "fmin z5.s, p3/M, z5.s, z28.s\n"
+ "fmin z29.s, p3/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x22]\n"
+ "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
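(The hunks above only renumber SVE vector registers and widen the inline-asm clobber list; the arithmetic is unchanged. As a hedged illustration of the movprfx+fmla idiom these kernels rely on, seed the accumulator from the per-channel bias, then fold each kernel tap in with a predicated fused multiply-add, here is a minimal ACLE sketch; every name in it is an assumption for illustration, not ComputeLibrary code.)

#include <arm_sve.h>

// Illustrative only: mirrors "movprfx zA, z_bias" followed by repeated
// "fmla zA.s, p/M, zW.s, zX.s" in the assembly above.
svfloat32_t accumulate_taps(svbool_t pg, svfloat32_t bias,
                            const float *weights, const float *inputs,
                            int n_taps, int tap_stride)
{
  svfloat32_t acc = bias;  // movprfx: copy the bias vector into the accumulator
  for (int t = 0; t < n_taps; ++t)
  {
    svfloat32_t w = svld1_f32(pg, weights + t * tap_stride);  // ld1w { zW.s }
    svfloat32_t x = svld1_f32(pg, inputs + t * tap_stride);   // ld1w { zX.s }
    acc = svmla_f32_m(pg, acc, w, x);  // fmla: acc += w * x under predicate pg
  }
  return acc;
}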
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index cb70bd2b6f..bf65e04d32 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -104,448 +104,448 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov x13, #0x0\n"
"ldp x12, x11, [x20, #0x10]\n"
"whilelt p3.s, XZR, %x[n_channels]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "cntw x28\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cntw x10\n"
"ptrue p2.b\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1w { z5.s }, p3/Z, [x10, x13, LSL #2]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1w { z6.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "sub x24, XZR, x28\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "ld1rw { z18.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z17.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z16.s }, p2/Z, [x27]\n"
- "ld1w { z0.s }, p2/Z, [x27, #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ld1w { z7.s }, p3/Z, [x26, x13, LSL #2]\n"
- "addvl x27, x27, #6\n"
- "ld1w { z8.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z13.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ld1w { z14.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z29.s }, p2/Z, [x9]\n"
+ "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "addvl x9, x9, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p2/M, z1.s, z8.s\n"
- "fmla z31.s, p2/M, z1.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.s, p2/M, z2.s, z9.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z2.s, z13.s\n"
- "fmla z31.s, p2/M, z2.s, z5.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.s, p2/M, z3.s, z5.s\n"
- "fmla z31.s, p2/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z6.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.s, p2/M, z0.s, z7.s\n"
- "fmla z29.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p2/M, z0.s, z14.s\n"
- "fmla z31.s, p2/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.s, p2/M, z1.s, z8.s\n"
- "fmla z29.s, p2/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.s, p2/M, z1.s, z11.s\n"
- "fmla z31.s, p2/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.s, p2/M, z2.s, z13.s\n"
- "fmla z29.s, p2/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.s, p2/M, z2.s, z12.s\n"
- "fmla z31.s, p2/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.s, p2/M, z3.s, z5.s\n"
- "fmla z29.s, p2/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.s, p2/M, z3.s, z9.s\n"
- "fmla z31.s, p2/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.s, p2/M, z4.s, z6.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z13.s\n"
- "fmla z31.s, p2/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.s, p2/M, z0.s, z14.s\n"
- "fmla z29.s, p2/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.s, p2/M, z0.s, z5.s\n"
- "fmla z31.s, p2/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.s, p2/M, z1.s, z11.s\n"
- "fmla z29.s, p2/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z31.s, p2/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
- "whilelt p1.s, x28, %x[n_channels]\n"
- "fmla z28.s, p2/M, z2.s, z12.s\n"
- "fmla z29.s, p2/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.s, p2/M, z2.s, z10.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
- "incw x24\n"
- "fmla z28.s, p2/M, z3.s, z9.s\n"
- "fmla z29.s, p2/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.s, p2/M, z4.s, z13.s\n"
- "fmla z29.s, p2/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.s, p2/M, z0.s, z5.s\n"
- "fmla z29.s, p2/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.s, p2/M, z0.s, z9.s\n"
- "fmla z31.s, p2/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x27, #4, MUL VL]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.s, p2/M, z1.s, z13.s\n"
- "fmla z31.s, p2/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p2/Z, [x27]\n"
- "fmla z28.s, p2/M, z2.s, z10.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z2.s, z5.s\n"
- "fmla z31.s, p2/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z3.s, z6.s\n"
- "fmla z31.s, p2/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z8.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.s, p2/M, z0.s, z9.s\n"
- "fmla z29.s, p2/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z0.s, z11.s\n"
- "fmla z31.s, p2/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "fmla z28.s, p2/M, z1.s, z13.s\n"
- "fmla z29.s, p2/M, z1.s, z5.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "fmla z30.s, p2/M, z1.s, z12.s\n"
- "fmla z31.s, p2/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "fmla z28.s, p2/M, z2.s, z5.s\n"
- "fmla z29.s, p2/M, z2.s, z6.s\n"
- "ld1w { z5.s }, p1/Z, [x10, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z26.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.s, p2/M, z2.s, z9.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldp x26, x25, [x16, #0x10]\n"
- "fmla z28.s, p2/M, z3.s, z6.s\n"
- "fmla z29.s, p2/M, z3.s, z8.s\n"
- "ld1w { z6.s }, p1/Z, [x9, x28, LSL #2]\n"
- "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z27.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z26.s, p2/M, z2.s, z5.s\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ldp x10, x9, [x16, #0x40]\n"
- "fmla z28.s, p2/M, z4.s, z8.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "incw x13\n"
- "ld1w { z7.s }, p1/Z, [x26, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.s, p2/M, z3.s, z5.s\n"
+ "fmla z26.s, p2/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z9.s\n"
- "ld1w { z8.s }, p1/Z, [x25, x28, LSL #2]\n"
- "ld1w { z9.s }, p1/Z, [x23, x28, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x21, x28, LSL #2]\n"
- "fmax z28.s, p2/M, z28.s, z18.s\n"
- "fmax z29.s, p2/M, z29.s, z18.s\n"
- "ld1w { z12.s }, p1/Z, [x20, x28, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x10, x28, LSL #2]\n"
- "fmax z30.s, p2/M, z30.s, z18.s\n"
- "fmax z31.s, p2/M, z31.s, z18.s\n"
- "ld1w { z14.s }, p1/Z, [x9, x28, LSL #2]\n"
+ "fmla z27.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z22.s\n"
+ "fmla z26.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z21.s, z7.s\n"
+ "fmla z27.s, p2/M, z21.s, z8.s\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.s, p2/M, z21.s, z14.s\n"
+ "fmla z26.s, p2/M, z21.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z18.s, z8.s\n"
+ "fmla z27.s, p2/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z17.s, z13.s\n"
+ "fmla z27.s, p2/M, z17.s, z5.s\n"
+ "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.s, p2/M, z17.s, z0.s\n"
+ "fmla z26.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z16.s, z5.s\n"
+ "fmla z27.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.s, p2/M, z16.s, z29.s\n"
+ "fmla z26.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z19.s, z22.s\n"
+ "fmla z27.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z3.s\n"
+ "fmla z26.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z25.s, z14.s\n"
+ "fmla z27.s, p2/M, z25.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.s, p2/M, z25.s, z6.s\n"
+ "fmla z26.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z18.s, z11.s\n"
+ "fmla z27.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.s, p2/M, z18.s, z23.s\n"
+ "fmla z26.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p2/M, z17.s, z0.s\n"
+ "fmla z27.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.s, p2/M, z17.s, z22.s\n"
+ "fmla z26.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
"incw x28\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
+ "fmla z30.s, p2/M, z16.s, z29.s\n"
+ "fmla z27.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.s, p2/M, z16.s, z7.s\n"
+ "fmla z26.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z21.s, z3.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "fmla z26.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z20.s, z6.s\n"
+ "fmla z27.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z26.s, p2/M, z20.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.s, p2/M, z18.s, z23.s\n"
+ "fmla z27.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z27.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z17.s, z25.s\n"
+ "fmla z26.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z7.s\n"
+ "fmla z27.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z16.s, z24.s\n"
+ "fmla z26.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z10.s, z19.s\n"
+ "fmla z27.s, p2/M, z10.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z10.s, z13.s\n"
+ "fmla z26.s, p2/M, z10.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z8.s, z0.s\n"
+ "fmla z27.s, p2/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z8.s, z18.s\n"
+ "fmla z26.s, p2/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.s, p2/M, z23.s, z11.s\n"
+ "fmla z27.s, p2/M, z23.s, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.s, p2/M, z23.s, z17.s\n"
+ "fmla z26.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.s, p2/M, z21.s, z25.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z16.s\n"
+ "fmla z26.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z27.s, p2/M, z20.s, z13.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.s, p2/M, z20.s, z18.s\n"
+ "fmla z26.s, p2/M, z20.s, z17.s\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.s, p2/M, z19.s, z13.s\n"
+ "fmla z27.s, p2/M, z19.s, z22.s\n"
+ "incw x13\n"
+ "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z17.s\n"
+ "fmla z26.s, p2/M, z19.s, z16.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z27.s, p2/M, z27.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmax z26.s, p2/M, z26.s, z15.s\n"
+ "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"whilelt p3.s, x13, %x[n_channels]\n"
- "cmp x28, %x[n_channels]\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "fmin z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
- "fmin z30.s, p2/M, z30.s, z17.s\n"
- "fmin z31.s, p2/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
- "addvl x27, x27, #-6\n"
- "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z27.s, p2/M, z27.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "fmin z26.s, p2/M, z26.s, z28.s\n"
+ "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
+ "addvl x9, x9, #-6\n"
+ "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
- "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
- "ldr x26, [x16, #0x50]\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
- "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x58]\n"
- "ldr x23, [x16, #0x60]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x22, [x16, #0x68]\n"
- "fmla z30.s, p2/M, z1.s, z8.s\n"
- "fmla z31.s, p2/M, z1.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.s, p2/M, z2.s, z9.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z2.s, z13.s\n"
- "fmla z31.s, p2/M, z2.s, z5.s\n"
- "ldr x20, [x16, #0x78]\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla z30.s, p2/M, z3.s, z5.s\n"
- "fmla z31.s, p2/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z6.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
- "ldr x26, [x16, #0x90]\n"
- "fmla z28.s, p2/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
"fmla z29.s, p2/M, z0.s, z8.s\n"
- "ldr x25, [x16, #0x98]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.s, p2/M, z0.s, z14.s\n"
- "fmla z31.s, p2/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.s, p2/M, z1.s, z8.s\n"
- "fmla z29.s, p2/M, z1.s, z13.s\n"
- "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z30.s, p2/M, z1.s, z11.s\n"
- "fmla z31.s, p2/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z28.s, p2/M, z2.s, z13.s\n"
- "fmla z29.s, p2/M, z2.s, z5.s\n"
- "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla z30.s, p2/M, z2.s, z12.s\n"
- "fmla z31.s, p2/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
- "addvl x27, x27, #16\n"
- "fmla z28.s, p2/M, z3.s, z5.s\n"
- "fmla z29.s, p2/M, z3.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla z30.s, p2/M, z3.s, z9.s\n"
- "fmla z31.s, p2/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
- "ldr x26, [x16, #0xd0]\n"
- "fmla z28.s, p2/M, z4.s, z6.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z13.s\n"
- "fmla z31.s, p2/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
- "ldr x25, [x16, #0xd8]\n"
- "fmla z28.s, p2/M, z0.s, z14.s\n"
- "fmla z29.s, p2/M, z0.s, z11.s\n"
- "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.s, p2/M, z0.s, z5.s\n"
- "fmla z31.s, p2/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z28.s, p2/M, z1.s, z11.s\n"
- "fmla z29.s, p2/M, z1.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x22, [x16, #0xe8]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
"fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z31.s, p2/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
- "incw x24\n"
- "fmla z28.s, p2/M, z2.s, z12.s\n"
- "fmla z29.s, p2/M, z2.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z30.s, p2/M, z2.s, z10.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z28.s, p2/M, z3.s, z9.s\n"
- "fmla z29.s, p2/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z13.s\n"
- "fmla z29.s, p2/M, z4.s, z8.s\n"
- "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla z28.s, p2/M, z0.s, z5.s\n"
- "fmla z29.s, p2/M, z0.s, z6.s\n"
- "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x16, #0x110]\n"
- "fmla z30.s, p2/M, z0.s, z9.s\n"
- "fmla z31.s, p2/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
- "fmla z28.s, p2/M, z1.s, z6.s\n"
- "fmla z29.s, p2/M, z1.s, z10.s\n"
- "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0x118]\n"
- "fmla z30.s, p2/M, z1.s, z13.s\n"
- "fmla z31.s, p2/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p2/Z, [x27]\n"
- "fmla z28.s, p2/M, z2.s, z10.s\n"
- "fmla z29.s, p2/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z2.s, z5.s\n"
- "fmla z31.s, p2/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
- "fmla z28.s, p2/M, z3.s, z11.s\n"
- "fmla z29.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z3.s, z6.s\n"
- "fmla z31.s, p2/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
- "fmla z28.s, p2/M, z4.s, z12.s\n"
- "fmla z29.s, p2/M, z4.s, z14.s\n"
- "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z4.s, z8.s\n"
- "fmla z31.s, p2/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
- "fmla z28.s, p2/M, z0.s, z9.s\n"
- "fmla z29.s, p2/M, z0.s, z13.s\n"
- "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z0.s, z11.s\n"
- "fmla z31.s, p2/M, z0.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z1.s, z13.s\n"
- "fmla z29.s, p2/M, z1.s, z5.s\n"
- "fmla z30.s, p2/M, z1.s, z12.s\n"
"fmla z31.s, p2/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z2.s, z5.s\n"
- "fmla z29.s, p2/M, z2.s, z6.s\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
"fmla z30.s, p2/M, z2.s, z9.s\n"
"fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p2/M, z3.s, z6.s\n"
- "fmla z29.s, p2/M, z3.s, z8.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"fmla z30.s, p2/M, z3.s, z11.s\n"
"fmla z31.s, p2/M, z3.s, z12.s\n"
- "fmla z28.s, p2/M, z4.s, z8.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "fmax z28.s, p2/M, z28.s, z18.s\n"
- "fmax z29.s, p2/M, z29.s, z18.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.s, p2/M, z3.s, z22.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
"fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z9.s\n"
- "fmax z30.s, p2/M, z30.s, z18.s\n"
- "fmax z31.s, p2/M, z31.s, z18.s\n"
- "fmin z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z17.s\n"
- "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
- "fmin z30.s, p2/M, z30.s, z17.s\n"
- "fmin z31.s, p2/M, z31.s, z17.s\n"
- "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z20.s, z7.s\n"
+ "fmla z31.s, p2/M, z20.s, z8.s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.s, p2/M, z20.s, z14.s\n"
+ "fmla z29.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z19.s, z8.s\n"
+ "fmla z31.s, p2/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.s, p2/M, z19.s, z1.s\n"
+ "fmla z29.s, p2/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z18.s, z13.s\n"
+ "fmla z31.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.s, p2/M, z18.s, z0.s\n"
+ "fmla z29.s, p2/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z31.s, p2/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.s, p2/M, z17.s, z27.s\n"
+ "fmla z29.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z16.s, z6.s\n"
+ "fmla z31.s, p2/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z24.s\n"
+ "fmla z29.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "fmla z31.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.s, p2/M, z21.s, z22.s\n"
+ "fmla z29.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z25.s, z1.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z25.s, z19.s\n"
+ "fmla z29.s, p2/M, z25.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z27.s\n"
+ "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.s, p2/M, z23.s, z18.s\n"
+ "fmla z29.s, p2/M, z23.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z20.s, z27.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.s, p2/M, z20.s, z9.s\n"
+ "fmla z29.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z24.s\n"
+ "fmla z31.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z8.s\n"
+ "fmla z29.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z21.s, z22.s\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.s, p2/M, z21.s, z10.s\n"
+ "fmla z29.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.s, p2/M, z4.s, z19.s\n"
+ "fmla z31.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.s, p2/M, z4.s, z0.s\n"
+ "fmla z29.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z6.s, z18.s\n"
+ "fmla z31.s, p2/M, z6.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z6.s, z26.s\n"
+ "fmla z29.s, p2/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z11.s, z9.s\n"
+ "fmla z31.s, p2/M, z11.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z11.s, z24.s\n"
+ "fmla z29.s, p2/M, z11.s, z27.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z8.s\n"
+ "fmla z31.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z27.s\n"
+ "fmla z29.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z25.s, z10.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z25.s, z18.s\n"
+ "fmla z29.s, p2/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z26.s\n"
+ "fmla z5.s, p2/M, z23.s, z17.s\n"
+ "fmla z29.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z21.s, z26.s\n"
+ "fmla z31.s, p2/M, z21.s, z24.s\n"
+ "fmla z5.s, p2/M, z21.s, z16.s\n"
+ "fmla z29.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z27.s\n"
+ "fmla z5.s, p2/M, z20.s, z18.s\n"
+ "fmla z29.s, p2/M, z20.s, z17.s\n"
+ "fmla z30.s, p2/M, z19.s, z27.s\n"
+ "fmla z31.s, p2/M, z19.s, z22.s\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmla z5.s, p2/M, z19.s, z17.s\n"
+ "fmla z29.s, p2/M, z19.s, z16.s\n"
+ "fmax z5.s, p2/M, z5.s, z15.s\n"
+ "fmax z29.s, p2/M, z29.s, z15.s\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z28.s\n"
+ "fmin z29.s, p2/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
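(As above, this file's changes are register renumbering plus the simplified SVE guard. For context, a self-contained sketch, under assumed names rather than the library's, of the predicated channel loop the assembly implements: whilelt builds the per-iteration predicate, fmla accumulates, and fmax/fmin apply the activation clamp broadcast from the min/max arguments.)

#include <arm_sve.h>

// Hedged illustration, not library code: one pass of a depthwise
// multiply-accumulate with min/max activation clamping.
void clamped_mla(const float *in, const float *weights, float *out,
                 int64_t n_channels, float act_min, float act_max)
{
  const svfloat32_t vmin = svdup_n_f32(act_min);  // ld1rw from offsetof_args_min
  const svfloat32_t vmax = svdup_n_f32(act_max);  // ld1rw from offsetof_args_max
  for (int64_t i = 0; i < n_channels; i += (int64_t) svcntw())
  {
    const svbool_t pg = svwhilelt_b32_s64(i, n_channels);  // whilelt pN.s, ...
    svfloat32_t acc = svld1_f32(pg, out + i);     // running accumulator
    const svfloat32_t x = svld1_f32(pg, in + i);
    const svfloat32_t w = svld1_f32(pg, weights + i);
    acc = svmla_f32_m(pg, acc, x, w);             // fmla zA.s, p/M, x.s, w.s
    acc = svmax_f32_m(pg, acc, vmin);             // clamp low  (fmax)
    acc = svmin_f32_m(pg, acc, vmax);             // clamp high (fmin)
    svst1_f32(pg, out + i, acc);                  // st1w { zA.s }
  }
}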
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index 62faca97a9..6b155fc855 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 204f36edca..d53daaa8a0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -55,9 +55,9 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
"2:" // Channel loop: Load bias: Done
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
- "subs x25, %x[n_points], #0x1\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, %x[n_points], #0x1\n"
"ldp x24, x23, [x10], #0x10\n"
"ldp x22, x21, [x10], #0x10\n"
"mov z24.d, z23.d\n"
@@ -68,12 +68,12 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
"mov z28.d, z23.d\n"
"mov z29.d, z23.d\n"
- "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
- "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
"mov z30.d, z23.d\n"
"mov z31.d, z23.d\n"
- "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
- "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
@@ -82,9 +82,9 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
- "subs x25, x25, #0x1\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, x9, #0x1\n"
"fmla z23.s, p1/M, z14.s, z0.s\n"
"ldp x24, x23, [x10], #0x10\n"
"ldp x22, x21, [x10], #0x10\n"
@@ -93,15 +93,15 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ldr x20, [x10], #0x8\n"
"fmla z26.s, p1/M, z17.s, z0.s\n"
"fmla z27.s, p1/M, z18.s, z0.s\n"
- "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
"fmla z28.s, p1/M, z19.s, z0.s\n"
"fmla z29.s, p1/M, z20.s, z0.s\n"
- "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
"fmla z31.s, p1/M, z22.s, z0.s\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
- "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 8640343747..eb1b111c36 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 7ba0edd991..3a71baaf61 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -53,21 +53,21 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr x11, [%x[inptrs], #0x20]\n"
"ldr x10, [%x[inptrs], #0x28]\n"
"ldr x9, [%x[inptrs], #0x30]\n"
- "ld1w { z26.s }, p2/Z, [%x[params]]\n"
- "mov z25.d, z26.d\n"
- "mov z24.d, z26.d\n"
+ "ld1w { z24.s }, p2/Z, [%x[params]]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
- "mov z23.d, z26.d\n"
- "mov z22.d, z26.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "mov z21.d, z26.d\n"
- "mov z20.d, z26.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z20.d, z24.d\n"
"ldr x20, [%x[outptrs], #0x40]\n"
"ld1rqw { z2.s }, p1/Z, [x16]\n"
- "mov z19.d, z26.d\n"
- "mov z18.d, z26.d\n"
+ "mov z23.d, z24.d\n"
+ "mov z19.d, z24.d\n"
"ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
"ld1rqw { z4.s }, p1/Z, [x15]\n"
"ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
@@ -81,175 +81,175 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
"ld1rqw { z14.s }, p1/Z, [x9]\n"
"ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
- "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
"ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
"1:" // Output channel complete vector loop
- "fmla z26.s, z31.s, z2.s[0]\n"
- "fmla z23.s, z31.s, z6.s[0]\n"
- "mov z0.d, z10.d\n"
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "fmla z27.s, z31.s, z6.s[0]\n"
+ "mov z1.d, z10.d\n"
"incw x17\n"
- "fmla z22.s, z31.s, z6.s[2]\n"
- "fmla z21.s, z31.s, z7.s[0]\n"
- "mov z1.d, z11.d\n"
+ "fmla z26.s, z31.s, z6.s[2]\n"
+ "fmla z28.s, z31.s, z7.s[0]\n"
+ "mov z0.d, z11.d\n"
"mov p0.b, p2.b\n"
- "fmla z25.s, z31.s, z2.s[2]\n"
- "fmla z24.s, z31.s, z3.s[0]\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z25.s, z31.s, z3.s[0]\n"
"whilelt p2.s, x17, %x[channel_multiplier]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "fmla z26.s, z30.s, z2.s[1]\n"
- "ld1w { z31.s }, p1/Z, [%x[params]]\n"
- "fmla z23.s, z30.s, z6.s[1]\n"
- "fmla z22.s, z30.s, z6.s[3]\n"
- "fmla z21.s, z30.s, z7.s[1]\n"
- "fmla z25.s, z30.s, z2.s[3]\n"
- "fmla z24.s, z30.s, z3.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "fmla z26.s, z29.s, z2.s[2]\n"
- "fmla z23.s, z29.s, z6.s[2]\n"
- "fmla z22.s, z29.s, z7.s[0]\n"
- "fmla z21.s, z29.s, z7.s[2]\n"
- "fmla z25.s, z29.s, z3.s[0]\n"
- "fmla z24.s, z29.s, z3.s[2]\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z31.s, z4.s[0]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z20.s, z31.s, z1.s[0]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "fmla z27.s, z30.s, z6.s[1]\n"
+ "fmla z26.s, z30.s, z6.s[3]\n"
+ "fmla z28.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z25.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[3]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z27.s, z29.s, z6.s[2]\n"
+ "fmla z26.s, z29.s, z7.s[0]\n"
+ "fmla z28.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z25.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z18.s, z4.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z5.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z17.s, z4.s[1]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z25.s, z17.s, z5.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[2]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z27.s, z31.s, z1.s[2]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z25.s, z31.s, z5.s[2]\n"
+ "fmla z20.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
"fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[2]\n"
- "mov z0.d, z12.d\n"
- "fmla z21.s, z31.s, z1.s[0]\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z31.s, z4.s[2]\n"
- "fmla z24.s, z31.s, z5.s[0]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
"fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z30.s, z4.s[1]\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[3]\n"
- "fmla z21.s, z30.s, z1.s[1]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z30.s, z4.s[3]\n"
- "fmla z24.s, z30.s, z5.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "mov z0.d, z8.d\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "mov z1.d, z9.d\n"
- "fmla z26.s, z29.s, z4.s[2]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmla z22.s, z29.s, z1.s[0]\n"
- "mov z0.d, z12.d\n"
- "fmla z21.s, z29.s, z1.s[2]\n"
- "mov z1.d, z13.d\n"
- "fmla z25.s, z29.s, z5.s[0]\n"
- "fmla z24.s, z29.s, z5.s[2]\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "mov z0.d, z10.d\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "mov z1.d, z11.d\n"
+ "mov z0.d, z11.d\n"
"ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
- "fmla z26.s, z31.s, z6.s[0]\n"
- "fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[2]\n"
- "fmla z21.s, z31.s, z1.s[0]\n"
- "mov z0.d, z14.d\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z31.s, z6.s[2]\n"
- "fmla z24.s, z31.s, z7.s[0]\n"
- "fmla z20.s, z31.s, z0.s[0]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "mov z0.d, z10.d\n"
- "fmla z18.s, z31.s, z1.s[0]\n"
- "mov z1.d, z11.d\n"
- "fmla z26.s, z30.s, z6.s[1]\n"
+ "fmla z24.s, z18.s, z6.s[0]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z14.d\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z17.s, z6.s[1]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[3]\n"
- "mov z0.d, z14.d\n"
- "fmla z21.s, z30.s, z1.s[1]\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z30.s, z6.s[3]\n"
- "fmla z24.s, z30.s, z7.s[1]\n"
- "fmla z20.s, z30.s, z0.s[1]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[1]\n"
- "mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z26.s, z29.s, z6.s[2]\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmin z26.s, p1/M, z26.s, z16.s\n"
- "fmla z22.s, z29.s, z1.s[0]\n"
- "fmla z21.s, z29.s, z1.s[2]\n"
- "mov z0.d, z14.d\n"
- "fmax z26.s, p1/M, z26.s, z17.s\n"
- "mov z1.d, z15.d\n"
- "fmla z25.s, z29.s, z7.s[0]\n"
- "fmla z24.s, z29.s, z7.s[2]\n"
- "fmin z25.s, p1/M, z25.s, z16.s\n"
- "fmla z20.s, z29.s, z0.s[2]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z14.d\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z17.s, z6.s[3]\n"
+ "fmla z25.s, z17.s, z7.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z27.s, z29.s, z1.s[2]\n"
"fmin z24.s, p1/M, z24.s, z16.s\n"
- "fmin z23.s, p1/M, z23.s, z16.s\n"
- "fmla z18.s, z29.s, z1.s[2]\n"
- "fmin z22.s, p1/M, z22.s, z16.s\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z28.s, z29.s, z0.s[2]\n"
+ "mov z1.d, z14.d\n"
+ "fmax z24.s, p1/M, z24.s, z22.s\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z25.s, z29.s, z7.s[2]\n"
"fmin z21.s, p1/M, z21.s, z16.s\n"
- "st1w { z26.s }, p0, [x28, x12, LSL #2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmin z27.s, p1/M, z27.s, z16.s\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
"fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
"fmin z19.s, p1/M, z19.s, z16.s\n"
- "ld1w { z26.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmin z18.s, p1/M, z18.s, z16.s\n"
"addvl %x[params], %x[params], #16\n"
"ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "fmax z25.s, p1/M, z25.s, z17.s\n"
- "fmax z24.s, p1/M, z24.s, z17.s\n"
- "st1w { z25.s }, p0, [x27, x12, LSL #2]\n"
- "mov z25.d, z26.d\n"
- "fmax z23.s, p1/M, z23.s, z17.s\n"
- "fmax z22.s, p1/M, z22.s, z17.s\n"
- "st1w { z24.s }, p0, [x26, x12, LSL #2]\n"
- "mov z24.d, z26.d\n"
- "fmax z21.s, p1/M, z21.s, z17.s\n"
- "fmax z20.s, p1/M, z20.s, z17.s\n"
- "st1w { z23.s }, p0, [x25, x12, LSL #2]\n"
- "mov z23.d, z26.d\n"
- "fmax z19.s, p1/M, z19.s, z17.s\n"
- "fmax z18.s, p1/M, z18.s, z17.s\n"
- "st1w { z22.s }, p0, [x24, x12, LSL #2]\n"
- "mov z22.d, z26.d\n"
- "st1w { z21.s }, p0, [x23, x12, LSL #2]\n"
- "mov z21.d, z26.d\n"
+ "fmax z21.s, p1/M, z21.s, z22.s\n"
+ "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z27.s, p1/M, z27.s, z22.s\n"
+ "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z25.d, z24.d\n"
+ "fmax z28.s, p1/M, z28.s, z22.s\n"
+ "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z27.d, z24.d\n"
+ "fmax z23.s, p1/M, z23.s, z22.s\n"
+ "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z26.d, z24.d\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z28.d, z24.d\n"
"addvl %x[params], %x[params], #-6\n"
"st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
- "mov z20.d, z26.d\n"
- "st1w { z19.s }, p0, [x21, x12, LSL #2]\n"
- "mov z19.d, z26.d\n"
- "st1w { z18.s }, p0, [x20, x12, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
"incw x12\n"
- "mov z18.d, z26.d\n"
+ "mov z19.d, z24.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
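
[Editor's note] The clobber list in this hunk grows to name z27 and z28: after the renaming the assembly writes those registers, and the compiler must be told so it does not keep live values in them across the block. A minimal stand-alone AArch64 example of the same contract (the operation is hypothetical, not from the patch):

long shift_left_one(long v)
{
    // "adds" writes the operand register and the condition flags, so the
    // "+r" constraint covers the register and "cc" must be clobbered.
    __asm__ __volatile__(
        "adds %x[v], %x[v], %x[v]\n"
        : [v] "+r" (v)
        :
        : "cc");
    return v;
}
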
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index a4ee87cce2..cc0c4236a8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 2ea116fc9e..84ab4b5035 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -52,21 +52,21 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"mov x10, #0x0\n"
"ldr x9, [%x[inptrs], #0x20]\n"
"ldr x28, [%x[inptrs], #0x28]\n"
- "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params]]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "mov z24.d, z25.d\n"
- "mov z23.d, z25.d\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "mov z22.d, z25.d\n"
- "mov z21.d, z25.d\n"
+ "mov z24.d, z16.d\n"
+ "mov z14.d, z16.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"ld1rqw { z2.s }, p1/Z, [x14]\n"
- "mov z20.d, z25.d\n"
- "mov z19.d, z25.d\n"
+ "mov z26.d, z16.d\n"
+ "mov z17.d, z16.d\n"
"ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
"ld1rqw { z4.s }, p1/Z, [x13]\n"
- "mov z18.d, z25.d\n"
+ "mov z23.d, z16.d\n"
"ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
"ld1rqw { z6.s }, p1/Z, [x12]\n"
"ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
@@ -76,8 +76,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
"ld1rqw { z12.s }, p1/Z, [x28]\n"
"ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
- "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
- "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
"ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
@@ -85,304 +85,304 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #6\n"
"1:" // Output channel complete vector loop
- "fmla z25.s, z31.s, z2.s[0]\n"
- "fmla z24.s, z31.s, z2.s[1]\n"
+ "fmla z16.s, z31.s, z2.s[0]\n"
+ "fmla z25.s, z31.s, z2.s[1]\n"
"mov z0.d, z8.d\n"
"incw x15\n"
- "fmla z23.s, z31.s, z2.s[2]\n"
- "fmla z22.s, z31.s, z2.s[3]\n"
+ "fmla z15.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z2.s[3]\n"
"mov z1.d, z9.d\n"
"mov p0.b, p2.b\n"
- "fmla z21.s, z31.s, z4.s[0]\n"
- "fmla z20.s, z31.s, z4.s[1]\n"
+ "fmla z14.s, z31.s, z4.s[0]\n"
+ "fmla z26.s, z31.s, z4.s[1]\n"
"whilelt p2.s, x15, %x[channel_multiplier]\n"
- "fmla z19.s, z31.s, z4.s[2]\n"
- "fmla z18.s, z31.s, z4.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params]]\n"
- "fmla z25.s, z30.s, z2.s[1]\n"
- "fmla z24.s, z30.s, z2.s[2]\n"
- "fmla z23.s, z30.s, z2.s[3]\n"
- "fmla z22.s, z30.s, z3.s[0]\n"
- "fmla z21.s, z30.s, z4.s[1]\n"
- "fmla z20.s, z30.s, z4.s[2]\n"
- "fmla z19.s, z30.s, z4.s[3]\n"
- "fmla z18.s, z30.s, z5.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "fmla z25.s, z29.s, z2.s[2]\n"
- "fmla z24.s, z29.s, z2.s[3]\n"
- "fmla z23.s, z29.s, z3.s[0]\n"
- "fmla z22.s, z29.s, z3.s[1]\n"
- "fmla z21.s, z29.s, z4.s[2]\n"
- "fmla z20.s, z29.s, z4.s[3]\n"
- "fmla z19.s, z29.s, z5.s[0]\n"
- "fmla z18.s, z29.s, z5.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "fmla z25.s, z28.s, z2.s[3]\n"
- "fmla z24.s, z28.s, z3.s[0]\n"
- "fmla z23.s, z28.s, z3.s[1]\n"
- "fmla z22.s, z28.s, z3.s[2]\n"
- "fmla z21.s, z28.s, z4.s[3]\n"
- "fmla z20.s, z28.s, z5.s[0]\n"
- "fmla z19.s, z28.s, z5.s[1]\n"
- "fmla z18.s, z28.s, z5.s[2]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z23.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z16.s, z30.s, z2.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[2]\n"
+ "fmla z15.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[0]\n"
+ "fmla z14.s, z30.s, z4.s[1]\n"
+ "fmla z26.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z23.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z25.s, z29.s, z2.s[3]\n"
+ "fmla z15.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[1]\n"
+ "fmla z14.s, z29.s, z4.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z23.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z2.s[3]\n"
+ "fmla z25.s, z28.s, z3.s[0]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z24.s, z28.s, z3.s[2]\n"
+ "fmla z14.s, z28.s, z4.s[3]\n"
+ "fmla z26.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z23.s, z28.s, z5.s[2]\n"
"ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "fmla z25.s, z27.s, z3.s[0]\n"
- "fmla z24.s, z27.s, z3.s[1]\n"
- "fmla z23.s, z27.s, z3.s[2]\n"
- "fmla z22.s, z27.s, z3.s[3]\n"
- "fmla z21.s, z27.s, z5.s[0]\n"
- "fmla z20.s, z27.s, z5.s[1]\n"
- "fmla z19.s, z27.s, z5.s[2]\n"
- "fmla z18.s, z27.s, z5.s[3]\n"
+ "fmla z16.s, z27.s, z3.s[0]\n"
+ "fmla z25.s, z27.s, z3.s[1]\n"
+ "fmla z15.s, z27.s, z3.s[2]\n"
+ "fmla z24.s, z27.s, z3.s[3]\n"
+ "fmla z14.s, z27.s, z5.s[0]\n"
+ "fmla z26.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z23.s, z27.s, z5.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "fmla z25.s, z31.s, z4.s[0]\n"
- "fmla z24.s, z31.s, z4.s[1]\n"
- "fmla z23.s, z31.s, z4.s[2]\n"
- "fmla z22.s, z31.s, z4.s[3]\n"
- "fmla z21.s, z31.s, z6.s[0]\n"
- "fmla z20.s, z31.s, z6.s[1]\n"
- "fmla z19.s, z31.s, z6.s[2]\n"
- "fmla z18.s, z31.s, z6.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
- "fmla z25.s, z30.s, z4.s[1]\n"
- "fmla z24.s, z30.s, z4.s[2]\n"
- "fmla z23.s, z30.s, z4.s[3]\n"
- "fmla z22.s, z30.s, z5.s[0]\n"
- "fmla z21.s, z30.s, z6.s[1]\n"
- "fmla z20.s, z30.s, z6.s[2]\n"
- "fmla z19.s, z30.s, z6.s[3]\n"
- "fmla z18.s, z30.s, z7.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "fmla z25.s, z29.s, z4.s[2]\n"
- "fmla z24.s, z29.s, z4.s[3]\n"
- "fmla z23.s, z29.s, z5.s[0]\n"
- "fmla z22.s, z29.s, z5.s[1]\n"
- "fmla z21.s, z29.s, z6.s[2]\n"
- "fmla z20.s, z29.s, z6.s[3]\n"
- "fmla z19.s, z29.s, z7.s[0]\n"
- "fmla z18.s, z29.s, z7.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z16.s, z20.s, z4.s[0]\n"
+ "fmla z25.s, z20.s, z4.s[1]\n"
+ "fmla z15.s, z20.s, z4.s[2]\n"
+ "fmla z24.s, z20.s, z4.s[3]\n"
+ "fmla z14.s, z20.s, z6.s[0]\n"
+ "fmla z26.s, z20.s, z6.s[1]\n"
+ "fmla z17.s, z20.s, z6.s[2]\n"
+ "fmla z23.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z4.s[1]\n"
+ "fmla z25.s, z19.s, z4.s[2]\n"
+ "fmla z15.s, z19.s, z4.s[3]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z14.s, z19.s, z6.s[1]\n"
+ "fmla z26.s, z19.s, z6.s[2]\n"
+ "fmla z17.s, z19.s, z6.s[3]\n"
+ "fmla z23.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z4.s[3]\n"
+ "fmla z15.s, z18.s, z5.s[0]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z14.s, z18.s, z6.s[2]\n"
+ "fmla z26.s, z18.s, z6.s[3]\n"
+ "fmla z17.s, z18.s, z7.s[0]\n"
+ "fmla z23.s, z18.s, z7.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmla z25.s, z28.s, z4.s[3]\n"
- "fmla z24.s, z28.s, z5.s[0]\n"
- "fmla z23.s, z28.s, z5.s[1]\n"
- "fmla z22.s, z28.s, z5.s[2]\n"
- "fmla z21.s, z28.s, z6.s[3]\n"
- "fmla z20.s, z28.s, z7.s[0]\n"
- "fmla z19.s, z28.s, z7.s[1]\n"
- "fmla z18.s, z28.s, z7.s[2]\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "fmla z25.s, z27.s, z5.s[0]\n"
- "fmla z24.s, z27.s, z5.s[1]\n"
- "fmla z23.s, z27.s, z5.s[2]\n"
- "fmla z22.s, z27.s, z5.s[3]\n"
- "fmla z21.s, z27.s, z7.s[0]\n"
- "fmla z20.s, z27.s, z7.s[1]\n"
- "fmla z19.s, z27.s, z7.s[2]\n"
- "fmla z18.s, z27.s, z7.s[3]\n"
+ "fmla z16.s, z28.s, z4.s[3]\n"
+ "fmla z25.s, z28.s, z5.s[0]\n"
+ "fmla z15.s, z28.s, z5.s[1]\n"
+ "fmla z24.s, z28.s, z5.s[2]\n"
+ "fmla z14.s, z28.s, z6.s[3]\n"
+ "fmla z26.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z23.s, z28.s, z7.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z27.s, z5.s[0]\n"
+ "fmla z25.s, z27.s, z5.s[1]\n"
+ "fmla z15.s, z27.s, z5.s[2]\n"
+ "fmla z24.s, z27.s, z5.s[3]\n"
+ "fmla z14.s, z27.s, z7.s[0]\n"
+ "fmla z26.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z23.s, z27.s, z7.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "fmla z25.s, z31.s, z6.s[0]\n"
- "fmla z24.s, z31.s, z6.s[1]\n"
- "fmla z23.s, z31.s, z6.s[2]\n"
- "fmla z22.s, z31.s, z6.s[3]\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "fmla z25.s, z30.s, z6.s[1]\n"
- "fmla z24.s, z30.s, z6.s[2]\n"
- "fmla z23.s, z30.s, z6.s[3]\n"
- "fmla z22.s, z30.s, z7.s[0]\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "fmla z25.s, z29.s, z6.s[2]\n"
- "fmla z24.s, z29.s, z6.s[3]\n"
- "fmla z23.s, z29.s, z7.s[0]\n"
- "fmla z22.s, z29.s, z7.s[1]\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "fmla z25.s, z28.s, z6.s[3]\n"
- "fmla z24.s, z28.s, z7.s[0]\n"
- "fmla z23.s, z28.s, z7.s[1]\n"
- "fmla z22.s, z28.s, z7.s[2]\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "fmla z25.s, z27.s, z7.s[0]\n"
- "fmla z24.s, z27.s, z7.s[1]\n"
- "fmla z23.s, z27.s, z7.s[2]\n"
- "fmla z22.s, z27.s, z7.s[3]\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
+ "fmla z16.s, z20.s, z6.s[0]\n"
+ "fmla z25.s, z20.s, z6.s[1]\n"
+ "fmla z15.s, z20.s, z6.s[2]\n"
+ "fmla z24.s, z20.s, z6.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z16.s, z19.s, z6.s[1]\n"
+ "fmla z25.s, z19.s, z6.s[2]\n"
+ "fmla z15.s, z19.s, z6.s[3]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "fmla z23.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z6.s[3]\n"
+ "fmla z15.s, z18.s, z7.s[0]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z14.s, z18.s, z0.s[2]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "fmla z17.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z30.s, z6.s[3]\n"
+ "fmla z25.s, z30.s, z7.s[0]\n"
+ "fmla z15.s, z30.s, z7.s[1]\n"
+ "fmla z24.s, z30.s, z7.s[2]\n"
+ "fmla z14.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z1.s[0]\n"
+ "fmla z17.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z16.s, z27.s, z7.s[0]\n"
+ "fmla z25.s, z27.s, z7.s[1]\n"
+ "fmla z15.s, z27.s, z7.s[2]\n"
+ "fmla z24.s, z27.s, z7.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "fmla z25.s, z31.s, z0.s[0]\n"
- "fmla z24.s, z31.s, z0.s[1]\n"
- "fmla z23.s, z31.s, z0.s[2]\n"
- "fmla z22.s, z31.s, z0.s[3]\n"
- "mov z0.d, z10.d\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "fmla z25.s, z30.s, z0.s[1]\n"
- "fmla z24.s, z30.s, z0.s[2]\n"
- "fmla z23.s, z30.s, z0.s[3]\n"
- "fmla z22.s, z30.s, z1.s[0]\n"
- "mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "mov z0.d, z8.d\n"
- "ld1w { z30.s }, p1/Z, [%x[params]]\n"
- "mov z1.d, z9.d\n"
- "fmla z25.s, z29.s, z0.s[2]\n"
- "fmla z24.s, z29.s, z0.s[3]\n"
- "fmla z23.s, z29.s, z1.s[0]\n"
- "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
"mov z0.d, z8.d\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mov z1.d, z9.d\n"
- "fmla z25.s, z28.s, z0.s[3]\n"
- "fmla z24.s, z28.s, z1.s[0]\n"
- "fmla z23.s, z28.s, z1.s[1]\n"
- "fmla z22.s, z28.s, z1.s[2]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z19.s }, p1/Z, [%x[params]]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z31.s, z1.s[3]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z15.s, z31.s, z0.s[1]\n"
+ "fmla z24.s, z31.s, z0.s[2]\n"
"mov z0.d, z10.d\n"
"mov z1.d, z11.d\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
+ "fmla z14.s, z31.s, z0.s[3]\n"
+ "fmla z26.s, z31.s, z1.s[0]\n"
+ "fmla z17.s, z31.s, z1.s[1]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
"mov z1.d, z9.d\n"
"ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "fmla z25.s, z27.s, z1.s[0]\n"
- "fmla z24.s, z27.s, z1.s[1]\n"
- "fmla z23.s, z27.s, z1.s[2]\n"
- "fmla z22.s, z27.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z1.s[0]\n"
+ "fmla z25.s, z27.s, z1.s[1]\n"
+ "fmla z15.s, z27.s, z1.s[2]\n"
+ "fmla z24.s, z27.s, z1.s[3]\n"
"mov z1.d, z11.d\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
"ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "fmla z25.s, z31.s, z0.s[0]\n"
- "fmla z24.s, z31.s, z0.s[1]\n"
- "fmla z23.s, z31.s, z0.s[2]\n"
- "fmla z22.s, z31.s, z0.s[3]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
"mov z0.d, z12.d\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[1]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
- "fmla z18.s, z31.s, z0.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
"ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z25.s, z30.s, z0.s[1]\n"
- "fmla z24.s, z30.s, z0.s[2]\n"
- "fmla z23.s, z30.s, z0.s[3]\n"
- "fmla z22.s, z30.s, z1.s[0]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[2]\n"
- "fmla z19.s, z30.s, z0.s[3]\n"
- "fmla z18.s, z30.s, z1.s[0]\n"
- "mov z0.d, z10.d\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z10.d\n"
"ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "mov z1.d, z11.d\n"
- "fmla z25.s, z29.s, z0.s[2]\n"
- "fmla z24.s, z29.s, z0.s[3]\n"
- "fmla z23.s, z29.s, z1.s[0]\n"
- "fmla z22.s, z29.s, z1.s[1]\n"
- "mov z0.d, z12.d\n"
- "mov z1.d, z13.d\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
- "fmla z20.s, z29.s, z0.s[3]\n"
- "fmla z19.s, z29.s, z1.s[0]\n"
- "fmla z18.s, z29.s, z1.s[1]\n"
- "mov z0.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
"ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "mov z1.d, z11.d\n"
- "fmla z25.s, z28.s, z0.s[3]\n"
- "fmla z24.s, z28.s, z1.s[0]\n"
- "fmla z23.s, z28.s, z1.s[1]\n"
- "fmla z22.s, z28.s, z1.s[2]\n"
- "mov z1.d, z13.d\n"
- "mov z0.d, z12.d\n"
- "fmla z20.s, z28.s, z1.s[0]\n"
- "fmla z19.s, z28.s, z1.s[1]\n"
- "fmla z18.s, z28.s, z1.s[2]\n"
- "mov z1.d, z11.d\n"
- "fmla z21.s, z28.s, z0.s[3]\n"
- "fmla z25.s, z27.s, z1.s[0]\n"
- "fmla z24.s, z27.s, z1.s[1]\n"
- "fmin z25.s, p1/M, z25.s, z16.s\n"
- "fmax z25.s, p1/M, z25.s, z17.s\n"
- "fmla z23.s, z27.s, z1.s[2]\n"
- "fmla z22.s, z27.s, z1.s[3]\n"
- "mov z1.d, z13.d\n"
- "fmin z24.s, p1/M, z24.s, z16.s\n"
- "fmla z21.s, z27.s, z1.s[0]\n"
- "fmla z20.s, z27.s, z1.s[1]\n"
- "fmin z23.s, p1/M, z23.s, z16.s\n"
- "fmin z22.s, p1/M, z22.s, z16.s\n"
- "fmla z19.s, z27.s, z1.s[2]\n"
- "fmla z18.s, z27.s, z1.s[3]\n"
- "fmin z21.s, p1/M, z21.s, z16.s\n"
- "fmin z20.s, p1/M, z20.s, z16.s\n"
- "fmin z19.s, p1/M, z19.s, z16.s\n"
- "fmin z18.s, p1/M, z18.s, z16.s\n"
- "st1w { z25.s }, p0, [x27, x10, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z0.s[1]\n"
+ "fmla z24.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "mov z1.d, z12.d\n"
+ "fmla z26.s, z28.s, z0.s[0]\n"
+ "fmla z17.s, z28.s, z0.s[1]\n"
+ "fmla z23.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z28.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z0.s[0]\n"
+ "fmla z25.s, z27.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z22.s\n"
+ "fmax z16.s, p1/M, z16.s, z21.s\n"
+ "fmla z15.s, z27.s, z0.s[2]\n"
+ "fmla z24.s, z27.s, z0.s[3]\n"
+ "mov z0.d, z13.d\n"
+ "fmin z25.s, p1/M, z25.s, z22.s\n"
+ "fmla z14.s, z27.s, z0.s[0]\n"
+ "fmla z26.s, z27.s, z0.s[1]\n"
+ "fmin z15.s, p1/M, z15.s, z22.s\n"
+ "fmin z24.s, p1/M, z24.s, z22.s\n"
+ "fmla z17.s, z27.s, z0.s[2]\n"
+ "fmla z23.s, z27.s, z0.s[3]\n"
+ "fmin z14.s, p1/M, z14.s, z22.s\n"
+ "fmin z26.s, p1/M, z26.s, z22.s\n"
+ "fmin z17.s, p1/M, z17.s, z22.s\n"
+ "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "st1w { z16.s }, p0, [x27, x10, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmax z24.s, p1/M, z24.s, z17.s\n"
- "st1w { z24.s }, p0, [x26, x10, LSL #2]\n"
- "mov z24.d, z25.d\n"
- "fmax z23.s, p1/M, z23.s, z17.s\n"
- "fmax z22.s, p1/M, z22.s, z17.s\n"
- "st1w { z23.s }, p0, [x25, x10, LSL #2]\n"
- "mov z23.d, z25.d\n"
- "fmax z21.s, p1/M, z21.s, z17.s\n"
- "fmax z20.s, p1/M, z20.s, z17.s\n"
- "st1w { z22.s }, p0, [x24, x10, LSL #2]\n"
- "mov z22.d, z25.d\n"
- "fmax z19.s, p1/M, z19.s, z17.s\n"
- "fmax z18.s, p1/M, z18.s, z17.s\n"
- "st1w { z21.s }, p0, [x23, x10, LSL #2]\n"
- "mov z21.d, z25.d\n"
- "st1w { z20.s }, p0, [x22, x10, LSL #2]\n"
- "mov z20.d, z25.d\n"
+ "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z15.s, p1/M, z15.s, z21.s\n"
+ "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z15.d, z16.d\n"
+ "fmax z14.s, p1/M, z14.s, z21.s\n"
+ "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z14.d, z16.d\n"
+ "st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z26.d, z16.d\n"
"ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
"ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "st1w { z19.s }, p0, [x21, x10, LSL #2]\n"
- "mov z19.d, z25.d\n"
+ "st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z17.d, z16.d\n"
"addvl %x[params], %x[params], #-6\n"
- "st1w { z18.s }, p0, [x20, x10, LSL #2]\n"
+ "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
"incw x10\n"
- "mov z18.d, z25.d\n"
+ "mov z23.d, z16.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
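
[Editor's note] As in the previous kernel, the epilogue clamps every accumulator between the two scalars broadcast from %x[clamps] (after this patch z21 holds the minimum and z22 the maximum). The scalar equivalent of each fmin/fmax pair, as a sketch rather than the library's interface:

#include <algorithm>

// One output lane of the activation clamp: fmin against the upper bound,
// then fmax against the lower bound (minmax_vals[0] = min, [1] = max).
inline float clamp_lane(float acc, const float minmax_vals[2])
{
    return std::max(std::min(acc, minmax_vals[1]), minmax_vals[0]);
}
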
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index e1f0b50d89..f83767d8ae 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -49,4 +49,4 @@ struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
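
[Editor's note] This header also shows the guard simplification named in the commit message: the explicit __aarch64__ test is dropped, leaving ARM_COMPUTE_ENABLE_SVE alone. That is sound if the build only ever defines the SVE macro on AArch64 targets — which the patch appears to assume, SVE being an AArch64-only extension. The resulting pattern:

#if defined(ARM_COMPUTE_ENABLE_SVE)
// SVE kernel declarations; no separate __aarch64__ check is needed when
// the macro is only defined for AArch64 builds (an assumption here).
#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
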
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index a43b81d7e8..1770ec182c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -46,405 +46,405 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ptrue p1.b\n"
"mov x9, #0x0\n"
- "ld1rw { z10.s }, p1/Z, [%x[minmax_vals]]\n"
- "ld1rw { z13.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
"whilelt p0.s, x9, %x[n_output_channels]\n"
"1:" // Output channel loop
- "mov z5.b, #0x0\n"
+ "mov z31.b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ld1w { z5.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
"2:" // Output channel loop: Load bias: Done
- "mov x21, %x[inptrs]\n"
- "ldp x24, x28, [x21], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov z16.d, z5.d\n"
- "mov z17.d, z5.d\n"
- "mov z18.d, z5.d\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "mov z19.d, z5.d\n"
- "mov z20.d, z5.d\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "mov z21.d, z5.d\n"
- "mov z22.d, z5.d\n"
+ "mov x23, %x[inptrs]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "lsr x22, %x[kernel_points], #0x1\n"
+ "mov z16.d, z31.d\n"
+ "mov z17.d, z31.d\n"
+ "mov z18.d, z31.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "mov z19.d, z31.d\n"
+ "mov z20.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "mov z21.d, z31.d\n"
+ "mov z22.d, z31.d\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
- "mov z23.d, z5.d\n"
- "mov z24.d, z5.d\n"
- "mov z25.d, z5.d\n"
- "mov z26.d, z5.d\n"
- "mov z27.d, z5.d\n"
- "mov z28.d, z5.d\n"
- "mov z29.d, z5.d\n"
- "mov z30.d, z5.d\n"
- "mov z31.d, z5.d\n"
- "cbz x20, 6f\n"
- "ldp x24, x28, [x21], #0x10\n"
- "subs x20, x20, #0x1\n"
- "ld1rqw { z5.s }, p1/Z, [x24]\n"
- "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
- "ld1rqw { z3.s }, p1/Z, [x28]\n"
- "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "mov z23.d, z31.d\n"
+ "mov z24.d, z31.d\n"
+ "mov z25.d, z31.d\n"
+ "mov z26.d, z31.d\n"
+ "mov z27.d, z31.d\n"
+ "mov z28.d, z31.d\n"
+ "mov z29.d, z31.d\n"
+ "mov z30.d, z31.d\n"
+ "mov z31.d, z31.d\n"
+ "cbz x22, 6f\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
"ld1w { z11.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "ldp x24, x28, [x21], #0x10\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "subs x20, x20, #0x1\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "ldp x24, x28, [x21], #0x10\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x24]\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "ld1rqw { z3.s }, p1/Z, [x28]\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
"ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
"addvl %x[weights], %x[weights], #2\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "ldp x24, x28, [x21], #0x10\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "ld1rqw { z1.s }, p1/Z, [x24]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x28]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
- "fmla z16.s, z11.s, z5.s[0]\n"
- "fmla z17.s, z11.s, z5.s[1]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldp x20, x28, [x23], #0x10\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x28]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
"addvl %x[weights], %x[weights], #1\n"
- "fmla z18.s, z11.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z5.s[3]\n"
- "fmla z20.s, z11.s, z7.s[0]\n"
- "fmla z21.s, z11.s, z7.s[1]\n"
- "fmla z22.s, z11.s, z7.s[2]\n"
- "fmla z23.s, z11.s, z7.s[3]\n"
- "fmla z24.s, z11.s, z3.s[0]\n"
- "fmla z25.s, z11.s, z3.s[1]\n"
- "fmla z26.s, z11.s, z3.s[2]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z2.s[0]\n"
- "fmla z29.s, z11.s, z2.s[1]\n"
- "fmla z30.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z2.s[3]\n"
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmla z16.s, z10.s, z6.s[0]\n"
+ "fmla z17.s, z10.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z10.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z10.s, z5.s[0]\n"
+ "fmla z21.s, z10.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z23.s, z10.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z10.s, z1.s[0]\n"
+ "fmla z25.s, z10.s, z1.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z10.s, z1.s[2]\n"
+ "fmla z27.s, z10.s, z1.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z10.s, z2.s[0]\n"
+ "fmla z29.s, z10.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla z16.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z8.s, z1.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z13.s\n"
- "fmin z17.s, p1/M, z17.s, z13.s\n"
- "fmla z18.s, z8.s, z1.s[2]\n"
- "fmla z19.s, z8.s, z1.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z13.s\n"
- "fmin z19.s, p1/M, z19.s, z13.s\n"
- "fmla z20.s, z8.s, z6.s[0]\n"
- "fmla z21.s, z8.s, z6.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z13.s\n"
- "fmin z21.s, p1/M, z21.s, z13.s\n"
- "fmla z22.s, z8.s, z6.s[2]\n"
- "fmla z23.s, z8.s, z6.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z13.s\n"
- "fmin z23.s, p1/M, z23.s, z13.s\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ldr x20, [%x[outptrs], #0x0]\n"
- "ldr x21, [%x[outptrs], #0x8]\n"
- "fmla z26.s, z8.s, z0.s[2]\n"
- "fmla z27.s, z8.s, z0.s[3]\n"
- "ldr x22, [%x[outptrs], #0x10]\n"
- "ldr x23, [%x[outptrs], #0x18]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z4.s[1]\n"
- "ldr x24, [%x[outptrs], #0x20]\n"
- "ldr x25, [%x[outptrs], #0x28]\n"
- "fmla z30.s, z8.s, z4.s[2]\n"
- "fmla z31.s, z8.s, z4.s[3]\n"
- "ldr x26, [%x[outptrs], #0x30]\n"
- "ldr x27, [%x[outptrs], #0x38]\n"
- "fmax z16.s, p1/M, z16.s, z10.s\n"
- "fmax z17.s, p1/M, z17.s, z10.s\n"
- "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "fmax z18.s, p1/M, z18.s, z10.s\n"
- "fmax z19.s, p1/M, z19.s, z10.s\n"
- "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
- "ldr x21, [%x[outptrs], #0x48]\n"
- "fmax z20.s, p1/M, z20.s, z10.s\n"
- "fmax z21.s, p1/M, z21.s, z10.s\n"
- "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
- "ldr x22, [%x[outptrs], #0x50]\n"
- "fmax z22.s, p1/M, z22.s, z10.s\n"
- "fmax z23.s, p1/M, z23.s, z10.s\n"
- "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
- "ldr x23, [%x[outptrs], #0x58]\n"
- "fmin z24.s, p1/M, z24.s, z13.s\n"
- "fmin z25.s, p1/M, z25.s, z13.s\n"
- "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
- "ldr x24, [%x[outptrs], #0x60]\n"
- "fmin z26.s, p1/M, z26.s, z13.s\n"
- "fmin z27.s, p1/M, z27.s, z13.s\n"
- "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
- "ldr x25, [%x[outptrs], #0x68]\n"
- "fmin z28.s, p1/M, z28.s, z13.s\n"
- "fmin z29.s, p1/M, z29.s, z13.s\n"
- "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
- "ldr x26, [%x[outptrs], #0x70]\n"
- "fmin z30.s, p1/M, z30.s, z13.s\n"
- "fmin z31.s, p1/M, z31.s, z13.s\n"
- "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
- "ldr x27, [%x[outptrs], #0x78]\n"
- "fmax z24.s, p1/M, z24.s, z10.s\n"
- "fmax z25.s, p1/M, z25.s, z10.s\n"
- "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z10.s\n"
- "fmax z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
- "fmax z28.s, p1/M, z28.s, z10.s\n"
- "fmax z29.s, p1/M, z29.s, z10.s\n"
- "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
- "fmax z30.s, p1/M, z30.s, z10.s\n"
- "fmax z31.s, p1/M, z31.s, z10.s\n"
- "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
- "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"7:" // Output channel loop: Done
"incw x9\n"
"whilelt p0.s, x9, %x[n_output_channels]\n"
"b.any 1b\n"
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 4e2ee43374..32ea009e8a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 4eae5961a0..0cee302c56 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,472 +30,464 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
"mov x20, #0x1\n"
- "ptrue p1.b\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
"orr x20, x20, #0x100\n"
"orr x20, x20, #0x10000\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "dup z12.s, w20\n"
- "mov x20, #0x0\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z4.d, z10.d\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"1:" // Loop
- "mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z8.b\n"
- "sdot z10.s, z21.b, z14.b\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z19.s, z12.b, z3.b\n"
- "sdot z31.s, z21.b, z8.b\n"
- "incw x13, ALL, MUL #4\n"
- "sdot z10.s, z16.b, z8.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z30.b\n"
- "sdot z19.s, z12.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "sdot z31.s, z16.b, z3.b\n"
- "sdot z10.s, z20.b, z3.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z4.s, z21.b, z14.b\n"
- "sdot z26.s, z21.b, z8.b\n"
- "mov z17.s, #0x0\n"
- "sdot z17.s, z12.b, z8.b\n"
- "sdot z17.s, z12.b, z3.b\n"
- "sdot z31.s, z20.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z4.s, z16.b, z8.b\n"
- "sdot z26.s, z16.b, z3.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z30.b\n"
- "mov z19.s, #0x0\n"
- "sdot z17.s, z12.b, z14.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params]]\n"
- "sdot z4.s, z20.b, z3.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "sdot z26.s, z20.b, z30.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- "sdot z19.s, z12.b, z7.b\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "sqadd z10.s, z10.s, z21.s\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "sdot z19.s, z12.b, z2.b\n"
- "and z16.d, z4.d, z8.d\n"
- "and z20.d, z31.d, z8.d\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z29.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sdot z19.s, z12.b, z13.b\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- "add z10.s, z10.s, z22.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "add z31.s, z31.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z7.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z13.b\n"
- "sdot z10.s, z16.b, z7.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "sdot z4.s, z21.b, z13.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z7.b\n"
- "sdot z17.s, z12.b, z7.b\n"
- "incw x20\n"
- "sdot z31.s, z16.b, z2.b\n"
- "sdot z10.s, z20.b, z2.b\n"
+ "mov z30.s, #0x0\n"
+ "sdot z30.s, z25.b, z9.b\n"
+ "sdot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z30.s, z25.b, z18.b\n"
+ "sdot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
+ "sdot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "sdot z27.s, z25.b, z9.b\n"
+ "sdot z31.s, z3.b, z18.b\n"
+ "sdot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "sdot z22.s, z26.b, z15.b\n"
+ "sdot z21.s, z26.b, z9.b\n"
+ "sdot z27.s, z25.b, z18.b\n"
+ "sdot z31.s, z1.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z4.s, z16.b, z7.b\n"
- "sdot z26.s, z16.b, z2.b\n"
+ "sdot z22.s, z3.b, z9.b\n"
+ "sdot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "sdot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sdot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "sdot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sdot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "sdot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "sdot z28.s, z1.b, z13.b\n"
+ "sdot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "sdot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "sdot z22.s, z1.b, z19.b\n"
+ "sdot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "sdot z31.s, z15.b, z12.b\n"
+ "sdot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z26.s, z15.b, z19.b\n"
+ "sdot z22.s, z15.b, z12.b\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z17.s, z12.b, z2.b\n"
- "sdot z31.s, z20.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "sdot z4.s, z20.b, z2.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "sdot z26.s, z20.b, z29.b\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z29.b\n"
- "and z21.d, z10.d, z8.d\n"
- "sdot z17.s, z12.b, z13.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z6.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "sdot z19.s, z12.b, z1.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z6.b\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z28.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z11.b\n"
- "sdot z10.s, z16.b, z6.b\n"
- "sdot z19.s, z12.b, z11.b\n"
+ "sdot z18.s, z25.b, z12.b\n"
+ "sdot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "sdot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "sdot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "sdot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "sdot z23.s, z19.b, z14.b\n"
+ "sdot z23.s, z30.b, z11.b\n"
+ "sdot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z28.s, z19.b, z14.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z4.s, z21.b, z11.b\n"
- "ext z6.b, z6.b, z6.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z6.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z17.s, z12.b, z6.b\n"
- "sdot z31.s, z16.b, z1.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z10.s, z20.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z4.s, z16.b, z6.b\n"
- "sdot z26.s, z16.b, z1.b\n"
- "sdot z17.s, z12.b, z1.b\n"
- "sdot z31.s, z20.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "sdot z4.s, z20.b, z1.b\n"
- "sdot z26.s, z20.b, z28.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z28.b\n"
- "sdot z17.s, z12.b, z11.b\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "mov z12.s, #0x0\n"
+ "sdot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z12.s, z25.b, z11.b\n"
+ "sdot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "sdot z28.s, z30.b, z11.b\n"
+ "sdot z26.s, z30.b, z20.b\n"
+ "sdot z12.s, z25.b, z20.b\n"
+ "sdot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "sdot z28.s, z21.b, z20.b\n"
+ "sdot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
+ "sdot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params]]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "sdot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "sdot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "sdot z1.s, z21.b, z29.b\n"
+ "sdot z1.s, z13.b, z17.b\n"
+ "sdot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
"mov z19.s, #0x0\n"
- "sdot z19.s, z12.b, z5.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "sdot z19.s, z12.b, z0.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "sdot z31.s, z21.b, z5.b\n"
- "movprfx z18, z19\n sdot z18.s, z12.b, z27.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "sdot z10.s, z21.b, z9.b\n"
- "sdot z10.s, z16.b, z5.b\n"
- "sdot z19.s, z12.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "sdot z4.s, z21.b, z9.b\n"
+ "sdot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z19.s, z25.b, z17.b\n"
+ "sdot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z30.s, z13.b, z4.b\n"
+ "sdot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "sdot z31.s, z20.b, z5.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "sdot z26.s, z21.b, z5.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "sdot z17.s, z12.b, z5.b\n"
- "sdot z31.s, z16.b, z0.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "sdot z10.s, z20.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z4.s, z16.b, z5.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "sdot z26.s, z16.b, z0.b\n"
- "sdot z17.s, z12.b, z0.b\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "sdot z31.s, z20.b, z27.b\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "sdot z4.s, z20.b, z0.b\n"
- "sdot z26.s, z20.b, z27.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "movprfx z16, z17\n sdot z16.s, z12.b, z27.b\n"
- "sdot z17.s, z12.b, z9.b\n"
- "and z21.d, z10.d, z8.d\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "sdot z0.s, z20.b, z4.b\n"
+ "sdot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
+ "sdot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
"asr z21.s, z21.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "asr z16.s, z16.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
"sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "incw x20\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
- "mov z4.d, z10.d\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 3e9765165c..c9b4daf334 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4ebf5be285..8ac522dc9a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,320 +91,320 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
- "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
- "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1sb { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1sb { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1sb { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1sb { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1sb { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
- "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
- "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
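The requantization tail repeated in the loop above is the usual gemmlowp-style fixed-point pipeline: sqrdmulh by the per-channel multiplier, a rounding right shift assembled from the and/asr/sqadd/srshl sequence (the shift vector holds negative shift counts), saturating narrows via sqxtnb/sqxtnt, addition of the output offset c_offset, and a clamp to [minval, maxval] before st1b writes the low bytes. A minimal scalar sketch of one lane follows; the function names are illustrative rather than part of the library, and the INT32_MIN saturation corner of sqadd is glossed over.

    #include <algorithm>
    #include <cstdint>

    // Hedged scalar reference for one accumulator lane; names are illustrative.
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;   // sqrdmulh saturates here
        const int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)((p + (1LL << 30)) >> 31);                // rounded high half of 2*a*b
    }

    static int32_t rounding_shift_right(int32_t x, int exponent)
    {
        if (exponent <= 0) return x;
        const int32_t fixup = (x < 0) ? -1 : 0;                   // and + asr #31 on the sign bit
        const int64_t fixed = (int64_t)x + fixup;                 // sqadd (saturation ignored here)
        return (int32_t)((fixed + (1LL << (exponent - 1))) >> exponent); // srshl, round half up
    }

    static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                                  int16_t c_offset, int16_t minval, int16_t maxval)
    {
        int32_t v = sat_rounding_doubling_high_mul(acc, mul);     // sqrdmulh
        v = rounding_shift_right(v, right_shift);                 // and/asr/sqadd + srshl
        int32_t n = std::clamp(v, -32768, 32767);                 // sqxtnb/sqxtnt saturating narrow
        n = std::clamp(n + c_offset, (int32_t)minval, (int32_t)maxval); // sqadd + smax/smin
        return (int8_t)n;                                         // st1b keeps the low byte
    }

With per-channel mul and shift values taken from requant_muls and requant_shifts, this reproduces lane by lane what the vector sequence computes for a whole vector of channels at once.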
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 78bcd1407f..7a9b8a5bde 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 357c9f8399..fc9a48bb46 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1sb { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
"ld1sb { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
- ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
- ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
- ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
- ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
- ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
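Before any of the multiply-accumulates in these loops, both operands are widened from 8 to 16 bits with their quantization offsets removed: ssublb performs a signed subtract-long on the bottom byte lanes, and the smlalb/smlalt pairs then accumulate 16x16-to-32-bit products into the bottom/top word lanes. A scalar sketch of those two per-lane primitives, under the same illustrative-naming caveat:

    #include <cstdint>

    // Hedged per-lane models of the widening and accumulate steps above.
    static int16_t ssublb_lane(int8_t value, int8_t offset)
    {
        // ssublb z.h, z.b, zoff.b: widen to 16 bits while subtracting the
        // splatted a_offset (activations) or b_offset (weights).
        return (int16_t)((int16_t)value - (int16_t)offset);
    }

    static void smlal_lane(int32_t &acc, int16_t a, int16_t b)
    {
        // smlalb/smlalt z.s, p/M, za.h, zb.h: multiply signed 16-bit lanes
        // (bottom/top halves respectively) and accumulate the 32-bit product.
        acc += (int32_t)a * (int32_t)b;
    }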
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 41ecd520ae..1f8d6c5213 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
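Throughout these kernels the channel dimension is handled by predication rather than a scalar tail: whilelt regenerates the p0-p3 masks each iteration from the running channel offset against n_channels, so a partial final vector is simply masked in the predicated loads (ld1sb/ld1w) and stores (st1b). A one-lane model, with illustrative naming:

    #include <cstdint>

    // Hedged model of whilelt p.<t>, base, limit: lane i of the predicate is
    // active while base + i < limit (signed compare); inactive lanes are
    // zeroed or skipped by the predicated loads and stores.
    static bool whilelt_lane(int64_t base, int64_t lane, int64_t limit)
    {
        return base + lane < limit;
    }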
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index d8f4d8d199..7ff724ddd8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
- "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
- "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
- "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1sb { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
- "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
- "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
- "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
- "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
- "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
- "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
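
(Aside, for readers of the generated assembly above: every output tile in these s8q kernels ends with the same requantization tail — SQRDMULH against the per-channel multiplier, an AND/ASR/SQADD sign fixup, SRSHL by the negative per-channel shift loaded from requant_shifts, then the Requantize32 c_offset add and the minval/maxval clamp before the narrowing store. A minimal scalar sketch of what one lane goes through follows; the helper name, argument layout and the exact point at which the clamp happens are assumptions for illustration — the kernels interleave these steps across registers and may clamp after narrowing to 16 bits.)

// Scalar sketch (not library code) of the per-lane requantization sequence
// implemented above with SQRDMULH / AND / ASR / SQADD / SRSHL.
#include <algorithm>
#include <cstdint>

static int8_t requantize_lane(int32_t acc,       // int32 accumulator
                              int32_t mul,       // per-channel multiplier
                              int32_t neg_shift, // per-channel shift, <= 0
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQRDMULH: rounded high half of the doubling multiply (the saturating
    // corner case acc == mul == INT32_MIN is ignored in this sketch).
    int32_t x = (int32_t)(((int64_t)acc * (int64_t)mul + (1LL << 30)) >> 31);

    // AND/ASR/SQADD fixup: when both sign bits are set, subtract 1 so that
    // the rounding shift below rounds ties away from zero rather than
    // towards +infinity (arithmetic >> on negative values assumed).
    x += (x & neg_shift) >> 31;

    // SRSHL with a negative shift operand: rounding arithmetic shift right.
    int32_t n = -neg_shift;
    if (n > 0)
        x = (x + (1 << (n - 1))) >> n;

    // Add the output zero point and clamp to the quantized range.
    x += c_offset;
    return (int8_t)std::min(std::max(x, minval), maxval);
}

(Keeping the multiply as a doubling high-half product keeps the whole pipeline in 32 bits; the fixup exists only because SRSHL on its own rounds ties upwards.)
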
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 2e8c2019db..abc09ee5a3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 6fba4d47d2..274b29dcfc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -47,285 +47,285 @@ void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr x20, [%x[inptrs], #0x10]\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z15.b, #0x1\n"
- "lsr z15.s, z15.s, #0x8\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
"ld1b { z1.b }, p0/Z, [x23]\n"
"ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z30.d, z1.d\n"
- "mov z29.d, z1.d\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z28.d, z1.d\n"
- "mov z27.d, z2.d\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
"ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
"mov z26.d, z2.d\n"
- "mov z25.d, z2.d\n"
"ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z24.d, z4.d\n"
- "mov z23.d, z4.d\n"
- "ptrue p2.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z22.d, z4.d\n"
- "ext z30.b, z30.b, z30.b, #0x2\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z14.s, p2/M, z14.s\n"
- "ext z29.b, z29.b, z29.b, #0x4\n"
- "ext z28.b, z28.b, z28.b, #0x6\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
- "ext z27.b, z27.b, z27.b, #0x2\n"
- "ext z26.b, z26.b, z26.b, #0x4\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z25.b, z25.b, z25.b, #0x6\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z23.b, z23.b, z23.b, #0x4\n"
- "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"mov z21.d, z0.d\n"
"mov z20.d, z0.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"mov z19.d, z0.d\n"
- "mov z18.d, z3.d\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
"mov z16.d, z3.d\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"ext z21.b, z21.b, z21.b, #0x2\n"
"ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
"ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z29.s\n"
- "zip1 z30.s, z30.s, z28.s\n"
- "zip1 z2.s, z2.s, z26.s\n"
- "zip1 z27.s, z27.s, z25.s\n"
- "ext z18.b, z18.b, z18.b, #0x2\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
"ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z23.s\n"
- "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
"zip1 z0.s, z0.s, z20.s\n"
"zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z30.s\n"
- "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
"zip1 z3.s, z3.s, z17.s\n"
- "zip1 z18.s, z18.s, z16.s\n"
- "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
"zip1 z0.s, z0.s, z21.s\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z18.s\n"
+ "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
"mov z24.s, #0x0\n"
"mov z25.s, #0x0\n"
- "sdot z24.s, z15.b, z1.b[0]\n"
+ "sdot z24.s, z13.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
- "sdot z25.s, z15.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "sdot z23.s, z15.b, z1.b[2]\n"
- "mov z9.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "sdot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
+ "sdot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
"mov z18.s, #0x0\n"
- "sdot z21.s, z15.b, z2.b[0]\n"
+ "sdot z21.s, z13.b, z2.b[0]\n"
"mov z17.s, #0x0\n"
"mov z16.s, #0x0\n"
- "sdot z20.s, z15.b, z2.b[1]\n"
- "sdot z9.s, z15.b, z2.b[2]\n"
- "sdot z8.s, z15.b, z2.b[3]\n"
+ "sdot z19.s, z13.b, z2.b[1]\n"
+ "sdot z10.s, z13.b, z2.b[2]\n"
+ "sdot z8.s, z13.b, z2.b[3]\n"
"mov z0.q, z0.q[0]\n"
- "sdot z19.s, z15.b, z4.b[0]\n"
- "sdot z18.s, z15.b, z4.b[1]\n"
+ "sdot z20.s, z13.b, z4.b[0]\n"
+ "sdot z18.s, z13.b, z4.b[1]\n"
"mov z3.q, z3.q[0]\n"
- "sdot z17.s, z15.b, z4.b[2]\n"
- "sdot z16.s, z15.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[2]\n"
+ "sdot z16.s, z13.b, z4.b[3]\n"
"mov z31.s, #0x0\n"
"mov z30.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "sdot z31.s, z15.b, z0.b[0]\n"
+ "mov z26.s, #0x0\n"
+ "sdot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z28.s, #0x0\n"
- "sdot z30.s, z15.b, z0.b[1]\n"
- "sdot z29.s, z15.b, z0.b[2]\n"
- "sdot z28.s, z15.b, z0.b[3]\n"
+ "sdot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "sdot z26.s, z13.b, z0.b[2]\n"
+ "sdot z27.s, z13.b, z0.b[3]\n"
+ "sdot z28.s, z13.b, z3.b[0]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
"add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z20.s\n"
- "add z26.s, z23.s, z9.s\n"
- "add z27.s, z22.s, z8.s\n"
- "add z23.s, z19.s, z21.s\n"
- "mov z22.s, #0x0\n"
- "sdot z22.s, z15.b, z3.b[0]\n"
- "add z21.s, z18.s, z20.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
"mov z20.s, #0x0\n"
- "sdot z20.s, z15.b, z3.b[1]\n"
- "add z19.s, z17.s, z9.s\n"
+ "sdot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
"mov z18.s, #0x0\n"
- "sdot z18.s, z15.b, z3.b[2]\n"
- "add z17.s, z16.s, z8.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z15.b, z3.b[3]\n"
+ "sdot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
"add z24.s, z24.s, z31.s\n"
"add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z14.s\n"
- "mul z25.s, p2/M, z25.s, z14.s\n"
- "add z26.s, z26.s, z29.s\n"
- "add z27.s, z27.s, z28.s\n"
- "mul z26.s, p2/M, z26.s, z14.s\n"
- "mul z27.s, p2/M, z27.s, z14.s\n"
- "add z28.s, z23.s, z22.s\n"
- "add z29.s, z21.s, z20.s\n"
- "mul z28.s, p2/M, z28.s, z14.s\n"
- "mul z29.s, p2/M, z29.s, z14.s\n"
- "add z30.s, z19.s, z18.s\n"
- "add z31.s, z17.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z14.s\n"
- "mul z31.s, p2/M, z31.s, z14.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
"zip1 z19.s, z24.s, z26.s\n"
"zip1 z18.s, z25.s, z27.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
"zip1 z22.s, z19.s, z18.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"1:" // Loop
"sdot z24.s, z5.b, z0.b[0]\n"
"sdot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params]]\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"sdot z26.s, z5.b, z0.b[2]\n"
"sdot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"sdot z24.s, z6.b, z1.b[0]\n"
"sdot z25.s, z6.b, z1.b[1]\n"
- "whilelt p1.b, x9, x10\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"sdot z26.s, z6.b, z1.b[2]\n"
"sdot z27.s, z6.b, z1.b[3]\n"
"sdot z28.s, z5.b, z2.b[0]\n"
"sdot z29.s, z5.b, z2.b[1]\n"
"sdot z30.s, z5.b, z2.b[2]\n"
"sdot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"sdot z24.s, z7.b, z2.b[0]\n"
"sdot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
"sdot z26.s, z7.b, z2.b[2]\n"
"sdot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"sdot z28.s, z6.b, z3.b[0]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"sdot z30.s, z6.b, z3.b[2]\n"
"sdot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"sdot z28.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z20.d\n"
+ "and z19.d, z24.d, z21.d\n"
"sdot z30.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z20.d\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "and z17.d, z26.d, z20.d\n"
- "and z16.d, z27.d, z20.d\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #6\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
- ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
- ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
- ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
"sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
- ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
- "and z19.d, z28.d, z20.d\n"
- "and z18.d, z29.d, z20.d\n"
- "and z17.d, z30.d, z20.d\n"
- "and z16.d, z31.d, z20.d\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
- ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
- ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "smin z25.s, p2/M, z25.s, z10.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z10.s\n"
- "smin z27.s, p2/M, z27.s, z10.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z28.s, p2/M, z28.s, z10.s\n"
- "smin z29.s, p2/M, z29.s, z10.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
- "smin z30.s, p2/M, z30.s, z10.s\n"
- "smin z31.s, p2/M, z31.s, z10.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z25.s, p2/M, z25.s, z11.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z11.s\n"
- "smax z27.s, p2/M, z27.s, z11.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z11.s\n"
- "smax z29.s, p2/M, z29.s, z11.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z11.s\n"
- "smax z31.s, p2/M, z31.s, z11.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z13.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z13.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 4874fb9a77..701948f264 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 2ed7cfc815..a3b2b429c0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -36,7 +36,7 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
const int8_t *const *const inptrs,
int8_t *const *const outptrs,
const void *params,
- const unsigned int n_output_channels,
+ unsigned int n_output_channels,
const arm_gemm::Requantize32& qp
)
{
@@ -47,8 +47,8 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ldr x21, [%x[inptrs], #0x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z20.d, z3.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
"mov z18.d, z4.d\n"
@@ -59,132 +59,132 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z3.d, z3.d, z23.d\n"
"zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z20.d, z1.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z13.d, z5.d\n"
- "mov z19.d, z6.d\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z25.d, z7.d\n"
+ "mov z8.d, z7.d\n"
"zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
"mov z4.q, z4.q[0]\n"
"ptrue p2.b\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
"neg z23.s, p2/M, z23.s\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- "mov z30.b, #0x1\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
"mov z24.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "sdot z24.s, z30.b, z3.b[0]\n"
- "ld1w { z12.s }, p1/Z, [%x[params]]\n"
- "mov z18.s, #0x0\n"
+ "sdot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
"mov z17.s, #0x0\n"
- "sdot z28.s, z30.b, z3.b[2]\n"
+ "mov z16.s, #0x0\n"
+ "sdot z24.s, z28.b, z3.b[2]\n"
"mov x28, #0x0\n"
- "mov z16.d, z0.d\n"
- "sdot z18.s, z30.b, z4.b[0]\n"
- "sdot z17.s, z30.b, z4.b[2]\n"
+ "mov z27.d, z0.d\n"
+ "sdot z17.s, z28.b, z4.b[0]\n"
+ "sdot z16.s, z28.b, z4.b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "zip1 z1.d, z1.d, z20.d\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z13.d\n"
+ "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z19.d\n"
- "zip1 z7.d, z7.d, z25.d\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z26.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "sdot z26.s, z30.b, z2.b[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "sdot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z29.s, #0x1\n"
- "sdot z22.s, z30.b, z2.b[2]\n"
- "sdot z24.s, z29.b, z3.b[1]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z16.d\n"
+ "sdot z31.s, z28.b, z2.b[2]\n"
+ "sdot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
"mov z1.q, z1.q[0]\n"
- "sdot z28.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "sdot z18.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "mov z21.s, #0x0\n"
- "sdot z17.s, z29.b, z4.b[3]\n"
+ "mov z22.s, #0x0\n"
+ "sdot z16.s, z29.b, z4.b[3]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z20.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "sdot z21.s, z30.b, z1.b[0]\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z22.s, z28.b, z1.b[0]\n"
"mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
"mov z19.s, #0x0\n"
- "sdot z20.s, z30.b, z1.b[2]\n"
- "sdot z25.s, z30.b, z5.b[0]\n"
- "sdot z27.s, z30.b, z5.b[2]\n"
- "mov z0.q, z0.q[0]\n"
- "sdot z19.s, z30.b, z6.b[0]\n"
- "sdot z26.s, z29.b, z2.b[1]\n"
- "add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "sdot z18.s, z30.b, z6.b[2]\n"
- "sdot z22.s, z29.b, z2.b[3]\n"
- "add z17.s, z28.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z7.b[0]\n"
- "sdot z21.s, z29.b, z1.b[1]\n"
- "sdot z20.s, z29.b, z1.b[3]\n"
- "add z28.s, z26.s, z24.s\n"
- "sdot z25.s, z29.b, z5.b[1]\n"
+ "sdot z26.s, z28.b, z5.b[0]\n"
+ "sdot z27.s, z28.b, z5.b[2]\n"
+ "sdot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z28.b, z6.b[2]\n"
+ "sdot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "sdot z25.s, z28.b, z7.b[2]\n"
+ "sdot z30.s, z29.b, z2.b[1]\n"
+ "sdot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z28.b, z0.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z5.b[1]\n"
"sdot z27.s, z29.b, z5.b[3]\n"
- "add z31.s, z22.s, z17.s\n"
- "sdot z19.s, z29.b, z6.b[1]\n"
- "sdot z18.s, z29.b, z6.b[3]\n"
- "add z22.s, z21.s, z28.s\n"
- "sdot z16.s, z29.b, z7.b[1]\n"
- "add z21.s, z20.s, z31.s\n"
- "add z20.s, z25.s, z19.s\n"
- "add z19.s, z27.s, z18.s\n"
- "add z18.s, z16.s, z24.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z7.b[2]\n"
- "sdot z16.s, z29.b, z7.b[3]\n"
- "add z17.s, z16.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z0.b[0]\n"
- "sdot z16.s, z29.b, z0.b[1]\n"
- "add z24.s, z22.s, z16.s\n"
- "add z26.s, z22.s, z25.s\n"
+ "add z30.s, z30.s, z17.s\n"
+ "sdot z20.s, z29.b, z6.b[1]\n"
+ "sdot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "sdot z18.s, z29.b, z7.b[1]\n"
+ "sdot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z28.b, z0.b[2]\n"
+ "sdot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
"mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mov z16.s, #0x0\n"
- "sdot z16.s, z30.b, z0.b[2]\n"
- "sdot z16.s, z29.b, z0.b[3]\n"
- "add z25.s, z21.s, z16.s\n"
- "add z27.s, z21.s, z27.s\n"
"mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
"mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z28.s\n"
+ "add z28.s, z20.s, z30.s\n"
"add z29.s, z19.s, z31.s\n"
"mul z28.s, p2/M, z28.s, z23.s\n"
"mul z29.s, p2/M, z29.s, z23.s\n"
- "add z30.s, z18.s, z20.s\n"
- "add z31.s, z17.s, z19.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
"mul z30.s, p2/M, z30.s, z23.s\n"
"mul z31.s, p2/M, z31.s, z23.s\n"
"zip1 z19.s, z24.s, z26.s\n"
@@ -204,22 +204,22 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"1:" // Loop
"sdot z24.s, z8.b, z0.b[0]\n"
"sdot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"sdot z26.s, z8.b, z1.b[0]\n"
"sdot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"sdot z24.s, z9.b, z0.b[1]\n"
"sdot z25.s, z9.b, z0.b[3]\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
"sdot z26.s, z9.b, z1.b[1]\n"
"sdot z27.s, z9.b, z1.b[3]\n"
"sdot z28.s, z8.b, z2.b[0]\n"
"sdot z29.s, z8.b, z2.b[2]\n"
"sdot z30.s, z8.b, z3.b[0]\n"
"sdot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
"sdot z24.s, z10.b, z1.b[0]\n"
"sdot z25.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
@@ -228,7 +228,7 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"sdot z29.s, z9.b, z2.b[3]\n"
"sdot z30.s, z9.b, z3.b[1]\n"
"sdot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
"sdot z24.s, z11.b, z1.b[1]\n"
"sdot z25.s, z11.b, z1.b[3]\n"
"sdot z26.s, z11.b, z2.b[1]\n"
@@ -237,158 +237,158 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"sdot z29.s, z10.b, z3.b[2]\n"
"sdot z30.s, z10.b, z4.b[0]\n"
"sdot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z25.s, z8.b, z2.b[2]\n"
- "sdot z26.s, z8.b, z3.b[0]\n"
- "sdot z27.s, z8.b, z3.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z17.b, z2.b[0]\n"
+ "sdot z25.s, z17.b, z2.b[2]\n"
+ "sdot z26.s, z17.b, z3.b[0]\n"
+ "sdot z27.s, z17.b, z3.b[2]\n"
"sdot z28.s, z11.b, z3.b[1]\n"
"sdot z29.s, z11.b, z3.b[3]\n"
"sdot z30.s, z11.b, z4.b[1]\n"
"sdot z31.s, z11.b, z4.b[3]\n"
- "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[1]\n"
- "sdot z25.s, z9.b, z2.b[3]\n"
- "sdot z26.s, z9.b, z3.b[1]\n"
- "sdot z27.s, z9.b, z3.b[3]\n"
- "sdot z28.s, z8.b, z4.b[0]\n"
- "sdot z29.s, z8.b, z4.b[2]\n"
- "sdot z30.s, z8.b, z5.b[0]\n"
- "sdot z31.s, z8.b, z5.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "sdot z24.s, z10.b, z3.b[0]\n"
- "sdot z25.s, z10.b, z3.b[2]\n"
- "sdot z26.s, z10.b, z4.b[0]\n"
- "sdot z27.s, z10.b, z4.b[2]\n"
- "sdot z28.s, z9.b, z4.b[1]\n"
- "sdot z29.s, z9.b, z4.b[3]\n"
- "sdot z30.s, z9.b, z5.b[1]\n"
- "sdot z31.s, z9.b, z5.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z16.b, z2.b[1]\n"
+ "sdot z25.s, z16.b, z2.b[3]\n"
+ "sdot z26.s, z16.b, z3.b[1]\n"
+ "sdot z27.s, z16.b, z3.b[3]\n"
+ "sdot z28.s, z17.b, z4.b[0]\n"
+ "sdot z29.s, z17.b, z4.b[2]\n"
+ "sdot z30.s, z17.b, z5.b[0]\n"
+ "sdot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z19.b, z3.b[0]\n"
+ "sdot z25.s, z19.b, z3.b[2]\n"
+ "sdot z26.s, z19.b, z4.b[0]\n"
+ "sdot z27.s, z19.b, z4.b[2]\n"
+ "sdot z28.s, z16.b, z4.b[1]\n"
+ "sdot z29.s, z16.b, z4.b[3]\n"
+ "sdot z30.s, z16.b, z5.b[1]\n"
+ "sdot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z24.s, z11.b, z3.b[1]\n"
- "sdot z25.s, z11.b, z3.b[3]\n"
- "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "sdot z26.s, z11.b, z4.b[1]\n"
- "sdot z27.s, z11.b, z4.b[3]\n"
- "sdot z28.s, z10.b, z5.b[0]\n"
- "sdot z29.s, z10.b, z5.b[2]\n"
- "sdot z30.s, z10.b, z6.b[0]\n"
- "sdot z31.s, z10.b, z6.b[2]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z24.s, z8.b, z4.b[0]\n"
- "sdot z25.s, z8.b, z4.b[2]\n"
- "sdot z26.s, z8.b, z5.b[0]\n"
- "sdot z27.s, z8.b, z5.b[2]\n"
- "sdot z28.s, z11.b, z5.b[1]\n"
- "sdot z29.s, z11.b, z5.b[3]\n"
- "sdot z30.s, z11.b, z6.b[1]\n"
- "sdot z31.s, z11.b, z6.b[3]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sdot z24.s, z9.b, z4.b[1]\n"
- "sdot z25.s, z9.b, z4.b[3]\n"
- ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
- "sdot z26.s, z9.b, z5.b[1]\n"
- "sdot z27.s, z9.b, z5.b[3]\n"
- ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
- "sdot z28.s, z8.b, z6.b[0]\n"
- "sdot z29.s, z8.b, z6.b[2]\n"
- ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
- "sdot z30.s, z8.b, z7.b[0]\n"
- "sdot z31.s, z8.b, z7.b[2]\n"
- ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "sdot z28.s, z9.b, z6.b[1]\n"
- "sdot z29.s, z9.b, z6.b[3]\n"
- "and z16.d, z24.d, z19.d\n"
- "sdot z30.s, z9.b, z7.b[1]\n"
- "sdot z31.s, z9.b, z7.b[3]\n"
- "and z18.d, z25.d, z19.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ "sdot z24.s, z18.b, z3.b[1]\n"
+ "sdot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z18.b, z4.b[1]\n"
+ "sdot z27.s, z18.b, z4.b[3]\n"
+ "sdot z28.s, z19.b, z5.b[0]\n"
+ "sdot z29.s, z19.b, z5.b[2]\n"
+ "sdot z30.s, z19.b, z6.b[0]\n"
+ "sdot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
+ "sdot z28.s, z18.b, z5.b[1]\n"
+ "sdot z29.s, z18.b, z5.b[3]\n"
+ "sdot z30.s, z18.b, z6.b[1]\n"
+ "sdot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z24.s, z16.b, z4.b[1]\n"
+ "sdot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "sdot z26.s, z16.b, z5.b[1]\n"
+ "sdot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z28.s, z17.b, z6.b[0]\n"
+ "sdot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "sdot z30.s, z17.b, z7.b[0]\n"
+ "sdot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z28.s, z16.b, z6.b[1]\n"
+ "sdot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z16.b, z7.b[1]\n"
+ "sdot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
- ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
- ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
- ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
- "and z17.d, z26.d, z19.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z24.s, z24.s, z16.s\n"
- "and z16.d, z27.d, z19.d\n"
- ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
- ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
- "and z16.d, z28.d, z19.d\n"
- "and z18.d, z29.d, z19.d\n"
- "and z17.d, z30.d, z19.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z28.s, z28.s, z16.s\n"
- "and z16.d, z31.d, z19.d\n"
- ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
- ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
"smin z24.s, p2/M, z24.s, z15.s\n"
"smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
"smin z26.s, p2/M, z26.s, z15.s\n"
"smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
"smin z28.s, p2/M, z28.s, z15.s\n"
"smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"smin z30.s, p2/M, z30.s, z15.s\n"
"smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z13.s\n"
- "smax z25.s, p2/M, z25.s, z13.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z13.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z13.s\n"
- "smax z29.s, p2/M, z29.s, z13.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z13.s\n"
- "smax z31.s, p2/M, z31.s, z13.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z12.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z12.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z12.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 0d185fcafc..1730574933 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 6a432e1961..d9c8644fc4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,411 +30,403 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const int8_t *const *const inptrs,
- const int8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- int8_t *const *const outptrs
-)
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
"mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ptrue p1.b\n"
- "mov x24, #0x0\n"
- "ldp x23, x22, [%x[outptrs], #0x0]\n"
- "ldp x21, x20, [%x[outptrs], #0x10]\n"
- "ld1b { z9.b }, p2/Z, [x12, x13]\n"
- "ld1b { z8.b }, p2/Z, [x11, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z7.b }, p2/Z, [x10, x13]\n"
- "zip2 z6.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ld1b { z5.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z7.b, z8.b, z5.b\n"
- "zip2 z5.b, z8.b, z5.b\n"
- "ld1b { z4.b }, p2/Z, [x28, x13]\n"
- "ld1b { z3.b }, p2/Z, [x27, x13]\n"
- "zip2 z8.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z2.b }, p2/Z, [x26, x13]\n"
- "zip1 z7.b, z6.b, z5.b\n"
- "zip2 z5.b, z6.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z0.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "ld1b { z31.b }, p2/Z, [x12, x13]\n"
- "ld1b { z30.b }, p2/Z, [x11, x13]\n"
- "zip1 z2.b, z3.b, z1.b\n"
- "zip2 z1.b, z3.b, z1.b\n"
- "ld1b { z29.b }, p2/Z, [x10, x13]\n"
- "ld1b { z28.b }, p2/Z, [x9, x13]\n"
- "zip2 z27.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "ld1b { z26.b }, p2/Z, [x28, x13]\n"
- "ld1b { z25.b }, p2/Z, [x27, x13]\n"
- "zip1 z29.b, z30.b, z28.b\n"
- "zip2 z28.b, z30.b, z28.b\n"
- "ld1b { z24.b }, p2/Z, [x26, x13]\n"
- "ld1b { z23.b }, p2/Z, [x25, x13]\n"
- "zip2 z22.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z25.b, z23.b\n"
- "zip2 z23.b, z25.b, z23.b\n"
- "ld1w { z6.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z21.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z20.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z19.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip2 z3.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "zip1 z2.b, z0.b, z1.b\n"
- "zip2 z1.b, z0.b, z1.b\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip2 z30.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z27.b, z28.b\n"
- "zip2 z28.b, z27.b, z28.b\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip2 z25.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ptrue p2.b\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "zip2 z17.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z16.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "ld1b { z13.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x22, x13]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z22.b, z13.b, z16.b\n"
+ "zip1 z13.b, z13.b, z16.b\n"
+ "ld1b { z9.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x13]\n"
+ "zip1 z21.b, z18.b, z10.b\n"
+ "zip2 z10.b, z18.b, z10.b\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x24, x13]\n"
+ "zip2 z20.b, z9.b, z16.b\n"
+ "zip1 z9.b, z9.b, z16.b\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x13]\n"
+ "zip1 z18.b, z17.b, z8.b\n"
+ "zip2 z8.b, z17.b, z8.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip2 z17.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z16.b, z19.b, z6.b\n"
+ "zip2 z6.b, z19.b, z6.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z1.b, z13.b, z21.b\n"
+ "zip1 z13.b, z13.b, z21.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z0.b, z22.b, z10.b\n"
+ "zip2 z10.b, z22.b, z10.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z20.b, z8.b\n"
+ "zip2 z8.b, z20.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
- "zip1 z24.b, z22.b, z23.b\n"
- "zip2 z23.b, z22.b, z23.b\n"
- "mov z0.d, z6.d\n"
- "mov z27.d, z6.d\n"
- "mov z22.d, z6.d\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
"1:" // Loop
- "sdot z6.s, z18.b, z9.b\n"
- "sdot z27.s, z18.b, z4.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z17.b, z4.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "sdot z0.s, z18.b, z9.b\n"
- "ld1w { z9.s }, p1/Z, [%x[params]]\n"
- "sdot z22.s, z18.b, z4.b\n"
- "sdot z27.s, z17.b, z31.b\n"
+ "sdot z5.s, z29.b, z15.b\n"
+ "sdot z22.s, z29.b, z13.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z5.s, z28.b, z13.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "sdot z24.s, z29.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [%x[params]]\n"
+ "sdot z21.s, z29.b, z13.b\n"
+ "sdot z22.s, z28.b, z9.b\n"
"incw x13, ALL, MUL #4\n"
- "sdot z6.s, z16.b, z31.b\n"
- "ext z31.b, z31.b, z31.b, #0x1\n"
- "sdot z0.s, z17.b, z4.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "sdot z22.s, z17.b, z31.b\n"
- "sdot z27.s, z16.b, z26.b\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z31.b\n"
- "sdot z22.s, z16.b, z26.b\n"
- "and z18.d, z6.d, z4.d\n"
+ "sdot z5.s, z26.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "sdot z24.s, z28.b, z13.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z21.s, z28.b, z9.b\n"
+ "sdot z22.s, z26.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ "sdot z24.s, z26.b, z9.b\n"
+ "sdot z21.s, z26.b, z7.b\n"
+ "and z16.d, z5.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z3.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z8.b\n"
- "sdot z6.s, z17.b, z3.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z0.s, z18.b, z8.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "sdot z22.s, z18.b, z3.b\n"
- "sdot z27.s, z17.b, z30.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z0.s, z17.b, z3.b\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z5.s, z5.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z5.s, p2/M, z5.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z5.s, p2/M, z5.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z5.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z12.b\n"
+ "sdot z23.s, z17.b, z1.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z24.s, z18.b, z12.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z21.s, z18.b, z1.b\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z24.s, z17.b, z1.b\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z22.s, z17.b, z30.b\n"
- "sdot z27.s, z16.b, z25.b\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z30.b\n"
- "sdot z22.s, z16.b, z25.b\n"
- "and z18.d, z6.d, z4.d\n"
+ "sdot z21.s, z17.b, z31.b\n"
+ "sdot z22.s, z16.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z31.b\n"
+ "sdot z21.s, z16.b, z27.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z0.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z11.b\n"
+ "sdot z23.s, z17.b, z0.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z24.s, z18.b, z11.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z21.s, z18.b, z0.b\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z24.s, z17.b, z0.b\n"
+ "sdot z21.s, z17.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z30.b\n"
+ "sdot z21.s, z16.b, z25.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z2.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z7.b\n"
- "sdot z6.s, z17.b, z2.b\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z0.s, z18.b, z7.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z22.s, z18.b, z2.b\n"
- "sdot z27.s, z17.b, z29.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "sdot z0.s, z17.b, z2.b\n"
- "sdot z22.s, z17.b, z29.b\n"
- "sdot z27.s, z16.b, z24.b\n"
- "ext z24.b, z24.b, z24.b, #0x1\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z29.b\n"
- "sdot z22.s, z16.b, z24.b\n"
- "and z18.d, z6.d, z4.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1w { z9.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z0.s, z0.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "add z22.s, z22.s, z19.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "mov z0.d, z6.d\n"
- "ld1b { z17.b }, p1/Z, [%x[params]]\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "mov z27.d, z6.d\n"
- "sdot z27.s, z18.b, z1.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "mov z22.d, z6.d\n"
- "sdot z6.s, z18.b, z5.b\n"
- "sdot z6.s, z17.b, z1.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z0.s, z18.b, z5.b\n"
- "ld1w { z4.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "sdot z22.s, z18.b, z1.b\n"
- "sdot z27.s, z17.b, z28.b\n"
- "incw x24\n"
- "whilelt p0.s, x24, %x[n_channels]\n"
- "sdot z6.s, z16.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "sdot z0.s, z17.b, z1.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "sdot z22.s, z17.b, z28.b\n"
- "sdot z27.s, z16.b, z23.b\n"
- "ext z23.b, z23.b, z23.b, #0x1\n"
- "ld1b { z8.b }, p2/Z, [x11, x13]\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sdot z0.s, z16.b, z28.b\n"
- "sdot z22.s, z16.b, z23.b\n"
- "ld1b { z7.b }, p2/Z, [x10, x13]\n"
- "and z18.d, z6.d, z4.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x9, x13]\n"
- "ld1b { z3.b }, p2/Z, [x27, x13]\n"
- ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
- ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
- "ld1b { z2.b }, p2/Z, [x26, x13]\n"
- "ld1b { z1.b }, p2/Z, [x25, x13]\n"
- ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
- "ld1b { z9.b }, p2/Z, [x12, x13]\n"
- "and z17.d, z0.d, z4.d\n"
- "and z16.d, z27.d, z4.d\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z18.d, z22.d, z4.d\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z29.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z28.s, z18.b, z10.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z23.s, z18.b, z14.b\n"
+ "sdot z23.s, z17.b, z10.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z29.s, z18.b, z14.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z18.b, z10.b\n"
+ "sdot z28.s, z17.b, z8.b\n"
+ "incw x12\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "sdot z29.s, z17.b, z10.b\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "sdot z27.s, z17.b, z8.b\n"
+ "sdot z28.s, z16.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "ld1b { z26.b }, p0/Z, [x26, x13]\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z29.s, z16.b, z8.b\n"
+ "sdot z27.s, z16.b, z6.b\n"
+ "ld1b { z21.b }, p0/Z, [x25, x13]\n"
+ "and z16.d, z23.d, z22.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z14.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x13]\n"
+ ".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ "ld1b { z20.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ ".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "and z19.d, z29.d, z22.d\n"
+ "and z17.d, z28.d, z22.d\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z16.d, z27.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "ld1b { z9.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x13]\n"
"asr z17.s, z17.s, #0x1f\n"
- "ld1b { z31.b }, p2/Z, [x12, x13]\n"
- "ld1b { z30.b }, p2/Z, [x11, x13]\n"
"asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "ld1b { z29.b }, p2/Z, [x10, x13]\n"
- "ld1b { z28.b }, p2/Z, [x9, x13]\n"
- "sqadd z0.s, z0.s, z17.s\n"
+ "ld1b { z18.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x20, x13]\n"
+ "sqadd z29.s, z29.s, z19.s\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
- ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- "add z6.s, z6.s, z19.s\n"
- ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
- "smax z6.s, p1/M, z6.s, z21.s\n"
- "add z0.s, z0.s, z19.s\n"
- "add z27.s, z27.s, z19.s\n"
- "ld1b { z4.b }, p2/Z, [x28, x13]\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "add z22.s, z22.s, z19.s\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z6.s, p1/M, z6.s, z20.s\n"
- "smax z0.s, p1/M, z0.s, z21.s\n"
- "smax z27.s, p1/M, z27.s, z21.s\n"
- "smax z22.s, p1/M, z22.s, z21.s\n"
- "st1b { z6.s }, p0, [x23, x24]\n"
- "ld1b { z26.b }, p2/Z, [x28, x13]\n"
- "ld1b { z25.b }, p2/Z, [x27, x13]\n"
- "ld1b { z24.b }, p2/Z, [x26, x13]\n"
- "zip2 z6.b, z9.b, z7.b\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "ld1b { z23.b }, p2/Z, [x25, x13]\n"
- "zip1 z7.b, z8.b, z5.b\n"
- "zip2 z5.b, z8.b, z5.b\n"
- "smin z0.s, p1/M, z0.s, z20.s\n"
- "smin z27.s, p1/M, z27.s, z20.s\n"
- "smin z22.s, p1/M, z22.s, z20.s\n"
- "st1b { z0.s }, p0, [x22, x24]\n"
- "zip2 z8.b, z9.b, z7.b\n"
- "st1b { z27.s }, p0, [x21, x24]\n"
- "zip1 z9.b, z9.b, z7.b\n"
- "zip1 z7.b, z6.b, z5.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z22.s }, p0, [x20, x24]\n"
- "zip2 z5.b, z6.b, z5.b\n"
- "zip2 z0.b, z4.b, z2.b\n"
- "ld1w { z6.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "zip1 z2.b, z3.b, z1.b\n"
- "incw x24\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "zip2 z1.b, z3.b, z1.b\n"
- "zip2 z27.b, z31.b, z29.b\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z30.b, z28.b\n"
- "ld1b { z18.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "ld1b { z17.b }, p1/Z, [%x[params], #6, MUL VL]\n"
- "zip2 z28.b, z30.b, z28.b\n"
- "zip2 z22.b, z26.b, z24.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z29.s, p2/M, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z4.s\n"
+ "st1b { z23.s }, p1, [x11, x12]\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x13]\n"
+ "zip2 z17.b, z15.b, z21.b\n"
+ "zip1 z15.b, z15.b, z21.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip1 z16.b, z26.b, z14.b\n"
+ "zip2 z14.b, z26.b, z14.b\n"
+ "smin z29.s, p2/M, z29.s, z3.s\n"
+ "smin z28.s, p2/M, z28.s, z3.s\n"
+ "smin z27.s, p2/M, z27.s, z3.s\n"
+ "st1b { z29.s }, p1, [x10, x12]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "st1b { z28.s }, p1, [x9, x12]\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z27.s }, p1, [x28, x12]\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "zip2 z21.b, z13.b, z20.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z20.b, z25.b, z10.b\n"
+ "incw x12\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip2 z10.b, z25.b, z10.b\n"
+ "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z18.b, z24.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "zip2 z17.b, z7.b, z22.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z25.b, z23.b\n"
- "zip2 z23.b, z25.b, z23.b\n"
- "zip2 z3.b, z4.b, z2.b\n"
- "zip1 z4.b, z4.b, z2.b\n"
- "zip1 z2.b, z0.b, z1.b\n"
- "zip2 z1.b, z0.b, z1.b\n"
- "zip2 z30.b, z31.b, z29.b\n"
- "zip1 z31.b, z31.b, z29.b\n"
- "zip1 z29.b, z27.b, z28.b\n"
- "zip2 z28.b, z27.b, z28.b\n"
- "zip2 z25.b, z26.b, z24.b\n"
- "zip1 z26.b, z26.b, z24.b\n"
- "zip1 z24.b, z22.b, z23.b\n"
- "zip2 z23.b, z22.b, z23.b\n"
- "mov z0.d, z6.d\n"
- "mov z27.d, z6.d\n"
- "mov z22.d, z6.d\n"
+ "zip1 z7.b, z7.b, z22.b\n"
+ "zip1 z16.b, z23.b, z6.b\n"
+ "zip2 z6.b, z23.b, z6.b\n"
+ "zip2 z1.b, z13.b, z20.b\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z0.b, z21.b, z10.b\n"
+ "zip2 z10.b, z21.b, z10.b\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z19.b, z8.b\n"
+ "zip2 z8.b, z19.b, z8.b\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
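Note: the per-lane output math in the quantized kernels throughout this patch is arm_gemm's Requantize32 scheme, visible in the hunks above and below as sqrdmulh (saturating rounding doubling high multiply by the per-channel multiplier), an and/asr #31/sqadd sign fixup, srshl by a negative per-channel amount (a rounding right shift), the c_offset add, and an smax/smin clamp against minval/maxval. A minimal scalar sketch of one 32-bit lane, ignoring the saturation corner case where both multiplicands are INT32_MIN; the function name is illustrative, not part of the library:

    #include <algorithm>
    #include <cstdint>

    // One 32-bit lane of the requantize sequence above (illustrative).
    int32_t requantize_lane(int32_t acc, int32_t mul, unsigned shift,
                            int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: high half of 2*acc*mul, rounded.
        int64_t prod = (int64_t)acc * (int64_t)mul;
        int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);
        // and/asr #31/sqadd: nudge negative values down by one so the
        // rounding shift below rounds half away from zero, not half up.
        high += (high >> 31);
        // srshl with a negative shift amount: rounding right shift.
        int32_t out = shift ? ((high + (1 << (shift - 1))) >> shift) : high;
        // c_offset add, then clamp to [minval, maxval].
        return std::clamp(out + c_offset, minval, maxval);
    }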
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 648b2da163..9432cd7550 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 257c4d44dc..f0860c98b9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -30,472 +30,464 @@
namespace arm_conv {
namespace depthwise {
-void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
- const unsigned int n_channels,
- const uint8_t *const *const inptrs,
- const uint8_t *params,
- const int32_t *, // Bias, should be wrapped into the parameters
- const arm_gemm::Requantize32& qp,
- const int32_t *, const int32_t *, // Requant parameters, also wrapped
- uint8_t *const *const outptrs
-)
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x13, #0x0\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
"mov x20, #0x1\n"
- "ptrue p1.b\n"
- "ldp x24, x23, [%x[outptrs], #0x0]\n"
- "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
"orr x20, x20, #0x100\n"
"orr x20, x20, #0x10000\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "dup z12.s, w20\n"
- "mov x20, #0x0\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params]]\n"
- "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z4.d, z10.d\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"1:" // Loop
- "mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z8.b\n"
- "udot z10.s, z21.b, z14.b\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z19.s, z12.b, z3.b\n"
- "udot z31.s, z21.b, z8.b\n"
- "incw x13, ALL, MUL #4\n"
- "udot z10.s, z16.b, z8.b\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z30.b\n"
- "udot z19.s, z12.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "udot z31.s, z16.b, z3.b\n"
- "udot z10.s, z20.b, z3.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "udot z4.s, z21.b, z14.b\n"
- "udot z26.s, z21.b, z8.b\n"
- "mov z17.s, #0x0\n"
- "udot z17.s, z12.b, z8.b\n"
- "udot z17.s, z12.b, z3.b\n"
- "udot z31.s, z20.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "udot z4.s, z16.b, z8.b\n"
- "udot z26.s, z16.b, z3.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z30.b\n"
- "mov z19.s, #0x0\n"
- "udot z17.s, z12.b, z14.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params]]\n"
- "udot z4.s, z20.b, z3.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "udot z26.s, z20.b, z30.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- "udot z19.s, z12.b, z7.b\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "sqadd z10.s, z10.s, z21.s\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "udot z19.s, z12.b, z2.b\n"
- "and z16.d, z4.d, z8.d\n"
- "and z20.d, z31.d, z8.d\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z29.b\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "udot z19.s, z12.b, z13.b\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- "add z10.s, z10.s, z22.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "add z31.s, z31.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z7.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z13.b\n"
- "udot z10.s, z16.b, z7.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- "udot z4.s, z21.b, z13.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z7.b\n"
- "udot z17.s, z12.b, z7.b\n"
- "incw x20\n"
- "udot z31.s, z16.b, z2.b\n"
- "udot z10.s, z20.b, z2.b\n"
+ "mov z30.s, #0x0\n"
+ "udot z30.s, z25.b, z9.b\n"
+ "udot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z30.s, z25.b, z18.b\n"
+ "udot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "udot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
+ "udot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "udot z27.s, z25.b, z9.b\n"
+ "udot z31.s, z3.b, z18.b\n"
+ "udot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "udot z22.s, z26.b, z15.b\n"
+ "udot z21.s, z26.b, z9.b\n"
+ "udot z27.s, z25.b, z18.b\n"
+ "udot z31.s, z1.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z4.s, z16.b, z7.b\n"
- "udot z26.s, z16.b, z2.b\n"
+ "udot z22.s, z3.b, z9.b\n"
+ "udot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "udot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "udot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "udot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "udot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "udot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "udot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "udot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "udot z28.s, z1.b, z13.b\n"
+ "udot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "udot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "udot z22.s, z1.b, z19.b\n"
+ "udot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "udot z31.s, z15.b, z12.b\n"
+ "udot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z26.s, z15.b, z19.b\n"
+ "udot z22.s, z15.b, z12.b\n"
"addvl %x[params], %x[params], #16\n"
- "udot z17.s, z12.b, z2.b\n"
- "udot z31.s, z20.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "udot z4.s, z20.b, z2.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "udot z26.s, z20.b, z29.b\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z29.b\n"
- "and z21.d, z10.d, z8.d\n"
- "udot z17.s, z12.b, z13.b\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z6.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "udot z19.s, z12.b, z1.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z6.b\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z28.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z11.b\n"
- "udot z10.s, z16.b, z6.b\n"
- "udot z19.s, z12.b, z11.b\n"
+ "udot z18.s, z25.b, z12.b\n"
+ "udot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "udot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "udot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "udot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "udot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "udot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "udot z23.s, z19.b, z14.b\n"
+ "udot z23.s, z30.b, z11.b\n"
+ "udot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z28.s, z19.b, z14.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "udot z4.s, z21.b, z11.b\n"
- "ext z6.b, z6.b, z6.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z6.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
- "udot z17.s, z12.b, z6.b\n"
- "udot z31.s, z16.b, z1.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z10.s, z20.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "udot z4.s, z16.b, z6.b\n"
- "udot z26.s, z16.b, z1.b\n"
- "udot z17.s, z12.b, z1.b\n"
- "udot z31.s, z20.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "udot z4.s, z20.b, z1.b\n"
- "udot z26.s, z20.b, z28.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z28.b\n"
- "udot z17.s, z12.b, z11.b\n"
- "and z21.d, z10.d, z8.d\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "mov z12.s, #0x0\n"
+ "udot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "udot z12.s, z25.b, z11.b\n"
+ "udot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "udot z28.s, z30.b, z11.b\n"
+ "udot z26.s, z30.b, z20.b\n"
+ "udot z12.s, z25.b, z20.b\n"
+ "udot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "udot z28.s, z21.b, z20.b\n"
+ "udot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
+ "udot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params]]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "udot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "udot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "udot z1.s, z21.b, z29.b\n"
+ "udot z1.s, z13.b, z17.b\n"
+ "udot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
"mov z19.s, #0x0\n"
- "udot z19.s, z12.b, z5.b\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "mov z4.d, z10.d\n"
- "udot z19.s, z12.b, z0.b\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "mov z31.d, z10.d\n"
- "udot z31.s, z21.b, z5.b\n"
- "movprfx z18, z19\n udot z18.s, z12.b, z27.b\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "mov z26.d, z10.d\n"
- "udot z10.s, z21.b, z9.b\n"
- "udot z10.s, z16.b, z5.b\n"
- "udot z19.s, z12.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "udot z4.s, z21.b, z9.b\n"
+ "udot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z19.s, z25.b, z17.b\n"
+ "udot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "udot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "udot z30.s, z13.b, z4.b\n"
+ "udot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "udot z31.s, z20.b, z5.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "mov z17.s, #0x0\n"
- "udot z26.s, z21.b, z5.b\n"
- "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "udot z17.s, z12.b, z5.b\n"
- "udot z31.s, z16.b, z0.b\n"
- "incw x20\n"
- "whilelt p0.s, x20, %x[n_channels]\n"
- "udot z10.s, z20.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "udot z4.s, z16.b, z5.b\n"
- "whilelt p2.b, x13, %x[n_channels]\n"
- "udot z26.s, z16.b, z0.b\n"
- "udot z17.s, z12.b, z0.b\n"
- "ld1b { z13.b }, p2/Z, [x11, x13]\n"
- "ld1b { z11.b }, p2/Z, [x10, x13]\n"
- "udot z31.s, z20.b, z27.b\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "mls z10.s, p1/M, z19.s, z23.s\n"
- "ld1b { z7.b }, p2/Z, [x27, x13]\n"
- "udot z4.s, z20.b, z0.b\n"
- "udot z26.s, z20.b, z27.b\n"
- ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
- "ld1b { z6.b }, p2/Z, [x26, x13]\n"
- "movprfx z16, z17\n udot z16.s, z12.b, z27.b\n"
- "udot z17.s, z12.b, z9.b\n"
- "and z21.d, z10.d, z8.d\n"
- "ld1b { z9.b }, p2/Z, [x9, x13]\n"
- "mls z4.s, p1/M, z17.s, z23.s\n"
- "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "udot z0.s, z20.b, z4.b\n"
+ "udot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
+ "udot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
"asr z21.s, z21.s, #0x1f\n"
- "ld1b { z5.b }, p2/Z, [x25, x13]\n"
- "mls z26.s, p1/M, z16.s, z23.s\n"
- ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
- ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
- ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
- "ld1b { z14.b }, p2/Z, [x12, x13]\n"
- "ldp x12, x11, [%x[inptrs], #0x40]\n"
- "sqadd z10.s, z10.s, z21.s\n"
- "and z16.d, z4.d, z8.d\n"
- ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z8.d\n"
- "and z21.d, z26.d, z8.d\n"
- "ld1b { z3.b }, p2/Z, [x12, x13]\n"
- "ld1b { z2.b }, p2/Z, [x11, x13]\n"
- "asr z16.s, z16.s, #0x1f\n"
"asr z20.s, z20.s, #0x1f\n"
- "ld1b { z1.b }, p2/Z, [x10, x13]\n"
- "ld1b { z0.b }, p2/Z, [x9, x13]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "sqadd z4.s, z4.s, z16.s\n"
- ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
- "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
"sqadd z31.s, z31.s, z20.s\n"
- "sqadd z26.s, z26.s, z21.s\n"
- ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
- ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
- "add z10.s, z10.s, z22.s\n"
- "smax z10.s, p1/M, z10.s, z25.s\n"
- "add z4.s, z4.s, z22.s\n"
- "ld1b { z8.b }, p2/Z, [x28, x13]\n"
- "add z31.s, z31.s, z22.s\n"
- "add z26.s, z26.s, z22.s\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x25, [%x[inptrs], #0x70]\n"
- "smin z10.s, p1/M, z10.s, z24.s\n"
- "smax z4.s, p1/M, z4.s, z25.s\n"
- "st1b { z10.s }, p0, [x24, x20]\n"
- "ld1b { z30.b }, p2/Z, [x28, x13]\n"
- "smax z31.s, p1/M, z31.s, z25.s\n"
- "smax z26.s, p1/M, z26.s, z25.s\n"
- "ld1b { z29.b }, p2/Z, [x27, x13]\n"
- "ld1b { z28.b }, p2/Z, [x26, x13]\n"
- "ld1b { z27.b }, p2/Z, [x25, x13]\n"
- "zip2 z10.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "smin z4.s, p1/M, z4.s, z24.s\n"
- "zip1 z11.b, z13.b, z9.b\n"
- "zip2 z9.b, z13.b, z9.b\n"
- "smin z31.s, p1/M, z31.s, z24.s\n"
- "smin z26.s, p1/M, z26.s, z24.s\n"
- "st1b { z4.s }, p0, [x23, x20]\n"
- "zip2 z13.b, z14.b, z11.b\n"
- "zip1 z14.b, z14.b, z11.b\n"
- "ldp x12, x11, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p0, [x22, x20]\n"
- "zip1 z11.b, z10.b, z9.b\n"
- "zip2 z9.b, z10.b, z9.b\n"
- "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z26.s }, p0, [x21, x20]\n"
- "zip2 z4.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "incw x20\n"
- "zip1 z6.b, z7.b, z5.b\n"
- "zip2 z5.b, z7.b, z5.b\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "ldp x26, x25, [%x[inptrs], #0x30]\n"
- "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z1.b, z2.b, z0.b\n"
- "zip2 z0.b, z2.b, z0.b\n"
- "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z26.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z29.b, z27.b\n"
- "zip2 z27.b, z29.b, z27.b\n"
- "zip2 z7.b, z8.b, z6.b\n"
- "zip1 z8.b, z8.b, z6.b\n"
- "zip1 z6.b, z4.b, z5.b\n"
- "zip2 z5.b, z4.b, z5.b\n"
- "zip2 z2.b, z3.b, z1.b\n"
- "zip1 z3.b, z3.b, z1.b\n"
- "zip1 z1.b, z31.b, z0.b\n"
- "zip2 z0.b, z31.b, z0.b\n"
- "zip2 z29.b, z30.b, z28.b\n"
- "zip1 z30.b, z30.b, z28.b\n"
- "zip1 z28.b, z26.b, z27.b\n"
- "zip2 z27.b, z26.b, z27.b\n"
- "mov z4.d, z10.d\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
"mov z31.d, z10.d\n"
- "mov z26.d, z10.d\n"
+ "mov z21.d, z10.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
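Note: two idioms carry this dot-product kernel. The zip1/zip2 ladders interleave bytes from neighbouring input rows so that each 32-bit lane holds exactly the taps one udot consumes; and the broadcast constant built from mov #0x1 / orr #0x100 / orr #0x10000 (0x00010101 per lane, i.e. bytes {1, 1, 1, 0}) turns udot into a 3-tap input summer, producing the row sums that the mls instructions scale by b_offset and subtract from the accumulators. A scalar model of one udot lane; the function name is illustrative:

    #include <cstdint>

    // One lane of UDOT: four unsigned byte products accumulated into s32.
    int32_t udot_lane(int32_t acc, const uint8_t a[4], const uint8_t b[4])
    {
        for (int i = 0; i < 4; ++i)
            acc += (int32_t)a[i] * (int32_t)b[i];
        return acc;
    }

With a = {1, 1, 1, 0} the same lane degenerates into the 3-tap input sum used for the weight zero-point correction.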
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1cf20ef721..0300b71d7c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
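Note: this hunk is the "guard directives" part of the commit: the explicit __aarch64__ test is dropped and ARM_COMPUTE_ENABLE_SVE becomes the sole gate, which presumes the build system only defines that flag when targeting AArch64. A sketch that makes the assumption explicit; the static check below is hypothetical and not part of the patch:

    // Hypothetical compile-time restatement of the new guard's assumption.
    #if defined(ARM_COMPUTE_ENABLE_SVE) && !defined(__aarch64__)
      #error "ARM_COMPUTE_ENABLE_SVE is only expected on AArch64 builds"
    #endif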
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 386eb96cff..5c26010c0d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
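Note: weights loosens from const uint8_t * to const void * in this hunk, presumably because the pointer addresses the opaque packed parameter blob (interleaved bias words and weight bytes) that the assembly strides through with MUL VL offsets, rather than a plain array of u8 weights. The kernel body that follows is the mla counterpart of the dot variants: usublb widens u8 inputs and weights to s16 while subtracting their zero points (z12 holds a_offset for activations, z30 holds b_offset for weights), smlalb/smlalt accumulate into s32, and sqxtnb/sqxtnt later narrow the requantized results. One tap in scalar form; the function name is illustrative:

    #include <cstdint>

    // One tap of the mla kernel, scalar form (illustrative).
    int32_t mla_tap(int32_t acc, uint8_t input, uint8_t weight,
                    int32_t a_offset, int32_t b_offset)
    {
        int16_t x = (int16_t)((int32_t)input  - a_offset);  // usublb, a_offset
        int16_t w = (int16_t)((int32_t)weight - b_offset);  // usublb, b_offset
        return acc + (int32_t)x * (int32_t)w;               // smlalb / smlalt
    }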
@@ -91,320 +91,320 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z0.h }, p4/Z, [x16]\n"
- "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
- "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
- "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1b { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1b { z0.h }, p4/Z, [x16]\n"
- "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
- "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
- "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index a794095c6f..bcd0d60d3c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 9f21401840..1ea2fcbfbd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z0.h }, p4/Z, [x17]\n"
- "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1b { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
"ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1b { z0.h }, p4/Z, [x17]\n"
- "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1b { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index ac0a00b245..dfaa059e9f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstS
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 40e2f5df25..b8adbb8262 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1b { z0.h }, p4/Z, [x2]\n"
- "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1b { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1b { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1b { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1b { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1b { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1b { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1b { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1b { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1b { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1b { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1b { z0.h }, p4/Z, [x2]\n"
- "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
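
For reference, the requantisation tail that dominates the rewritten hunk above (sqrdmulh, the and/asr/sqadd fixup, srshl, sqxtnb/sqxtnt narrowing, the c_offset add and the smax/smin clamp) follows the usual gemmlowp-style fixed-point scheme. A minimal scalar sketch, assuming per-channel multipliers with non-positive shift values as carried by Requantize32; requantize and its helpers are illustrative names, not ACL API. The asm narrows to 16 bits before adding the offset and clamping, which is effectively equivalent for in-range clamp bounds:

#include <cstdint>
#include <algorithm>

// SQRDMULH: saturating, rounding, doubling high-half multiply.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the only overflow case
    return (int32_t)(((int64_t)a * b + (1LL << 30)) >> 31);
}

// SRSHL by a non-positive amount: shift right, rounding to nearest (ties up).
static int32_t srshl(int32_t x, int32_t shift)
{
    int n = -shift;
    return n == 0 ? x : (int32_t)(((int64_t)x + (1LL << (n - 1))) >> n);
}

static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh(acc, mul);
    if (v < 0 && shift < 0) v--;     // the and/asr/sqadd nudge seen above
    v = srshl(v, shift) + c_offset;  // rounding shift, then output zero point
    return (uint8_t)std::max(minval, std::min(maxval, v));  // smax/smin clamp
}

The nudge turns srshl's round-half-up into round-half-away-from-zero for negative values, matching gemmlowp's RoundingDivideByPOT.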
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 81c954a11b..d5382533a8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 66c24c34b5..a9cd8a7fa9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -47,285 +47,285 @@ void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"ldr x20, [%x[inptrs], #0x10]\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z15.b, #0x1\n"
- "lsr z15.s, z15.s, #0x8\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
"ld1b { z1.b }, p0/Z, [x23]\n"
"ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z30.d, z1.d\n"
- "mov z29.d, z1.d\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
"ldr x20, [%x[inptrs], #0x18]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z28.d, z1.d\n"
- "mov z27.d, z2.d\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
"ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
"mov z26.d, z2.d\n"
- "mov z25.d, z2.d\n"
"ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z24.d, z4.d\n"
- "mov z23.d, z4.d\n"
- "ptrue p2.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z22.d, z4.d\n"
- "ext z30.b, z30.b, z30.b, #0x2\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z14.s, p2/M, z14.s\n"
- "ext z29.b, z29.b, z29.b, #0x4\n"
- "ext z28.b, z28.b, z28.b, #0x6\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
- "ext z27.b, z27.b, z27.b, #0x2\n"
- "ext z26.b, z26.b, z26.b, #0x4\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z25.b, z25.b, z25.b, #0x6\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z23.b, z23.b, z23.b, #0x4\n"
- "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"mov z21.d, z0.d\n"
"mov z20.d, z0.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"mov z19.d, z0.d\n"
- "mov z18.d, z3.d\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
"mov z16.d, z3.d\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"ext z21.b, z21.b, z21.b, #0x2\n"
"ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
"ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z29.s\n"
- "zip1 z30.s, z30.s, z28.s\n"
- "zip1 z2.s, z2.s, z26.s\n"
- "zip1 z27.s, z27.s, z25.s\n"
- "ext z18.b, z18.b, z18.b, #0x2\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
"ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z23.s\n"
- "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
"zip1 z0.s, z0.s, z20.s\n"
"zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z30.s\n"
- "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
"zip1 z3.s, z3.s, z17.s\n"
- "zip1 z18.s, z18.s, z16.s\n"
- "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
"zip1 z0.s, z0.s, z21.s\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z18.s\n"
+ "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
"mov z24.s, #0x0\n"
"mov z25.s, #0x0\n"
- "udot z24.s, z15.b, z1.b[0]\n"
+ "udot z24.s, z13.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
- "udot z25.s, z15.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "udot z23.s, z15.b, z1.b[2]\n"
- "mov z9.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "udot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
+ "udot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
"mov z18.s, #0x0\n"
- "udot z21.s, z15.b, z2.b[0]\n"
+ "udot z21.s, z13.b, z2.b[0]\n"
"mov z17.s, #0x0\n"
"mov z16.s, #0x0\n"
- "udot z20.s, z15.b, z2.b[1]\n"
- "udot z9.s, z15.b, z2.b[2]\n"
- "udot z8.s, z15.b, z2.b[3]\n"
+ "udot z19.s, z13.b, z2.b[1]\n"
+ "udot z10.s, z13.b, z2.b[2]\n"
+ "udot z8.s, z13.b, z2.b[3]\n"
"mov z0.q, z0.q[0]\n"
- "udot z19.s, z15.b, z4.b[0]\n"
- "udot z18.s, z15.b, z4.b[1]\n"
+ "udot z20.s, z13.b, z4.b[0]\n"
+ "udot z18.s, z13.b, z4.b[1]\n"
"mov z3.q, z3.q[0]\n"
- "udot z17.s, z15.b, z4.b[2]\n"
- "udot z16.s, z15.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[2]\n"
+ "udot z16.s, z13.b, z4.b[3]\n"
"mov z31.s, #0x0\n"
"mov z30.s, #0x0\n"
- "mov z29.s, #0x0\n"
- "udot z31.s, z15.b, z0.b[0]\n"
+ "mov z26.s, #0x0\n"
+ "udot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z28.s, #0x0\n"
- "udot z30.s, z15.b, z0.b[1]\n"
- "udot z29.s, z15.b, z0.b[2]\n"
- "udot z28.s, z15.b, z0.b[3]\n"
+ "udot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "udot z26.s, z13.b, z0.b[2]\n"
+ "udot z27.s, z13.b, z0.b[3]\n"
+ "udot z28.s, z13.b, z3.b[0]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
"add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z20.s\n"
- "add z26.s, z23.s, z9.s\n"
- "add z27.s, z22.s, z8.s\n"
- "add z23.s, z19.s, z21.s\n"
- "mov z22.s, #0x0\n"
- "udot z22.s, z15.b, z3.b[0]\n"
- "add z21.s, z18.s, z20.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
"mov z20.s, #0x0\n"
- "udot z20.s, z15.b, z3.b[1]\n"
- "add z19.s, z17.s, z9.s\n"
+ "udot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
"mov z18.s, #0x0\n"
- "udot z18.s, z15.b, z3.b[2]\n"
- "add z17.s, z16.s, z8.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z15.b, z3.b[3]\n"
+ "udot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
"add z24.s, z24.s, z31.s\n"
"add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z14.s\n"
- "mul z25.s, p2/M, z25.s, z14.s\n"
- "add z26.s, z26.s, z29.s\n"
- "add z27.s, z27.s, z28.s\n"
- "mul z26.s, p2/M, z26.s, z14.s\n"
- "mul z27.s, p2/M, z27.s, z14.s\n"
- "add z28.s, z23.s, z22.s\n"
- "add z29.s, z21.s, z20.s\n"
- "mul z28.s, p2/M, z28.s, z14.s\n"
- "mul z29.s, p2/M, z29.s, z14.s\n"
- "add z30.s, z19.s, z18.s\n"
- "add z31.s, z17.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z14.s\n"
- "mul z31.s, p2/M, z31.s, z14.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
"zip1 z19.s, z24.s, z26.s\n"
"zip1 z18.s, z25.s, z27.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
"zip1 z22.s, z19.s, z18.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"1:" // Loop
"udot z24.s, z5.b, z0.b[0]\n"
"udot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params]]\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"udot z26.s, z5.b, z0.b[2]\n"
"udot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"udot z24.s, z6.b, z1.b[0]\n"
"udot z25.s, z6.b, z1.b[1]\n"
- "whilelt p1.b, x9, x10\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"udot z26.s, z6.b, z1.b[2]\n"
"udot z27.s, z6.b, z1.b[3]\n"
"udot z28.s, z5.b, z2.b[0]\n"
"udot z29.s, z5.b, z2.b[1]\n"
"udot z30.s, z5.b, z2.b[2]\n"
"udot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"udot z24.s, z7.b, z2.b[0]\n"
"udot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
"udot z26.s, z7.b, z2.b[2]\n"
"udot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"udot z28.s, z6.b, z3.b[0]\n"
"udot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"udot z30.s, z6.b, z3.b[2]\n"
"udot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
- "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"udot z28.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z20.d\n"
+ "and z19.d, z24.d, z21.d\n"
"udot z30.s, z7.b, z4.b[2]\n"
"udot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z20.d\n"
- "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
- "and z17.d, z26.d, z20.d\n"
- "and z16.d, z27.d, z20.d\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #6\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
- ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
- ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
- ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
"sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
- ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
- "and z19.d, z28.d, z20.d\n"
- "and z18.d, z29.d, z20.d\n"
- "and z17.d, z30.d, z20.d\n"
- "and z16.d, z31.d, z20.d\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
"sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
- ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
- ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "smin z25.s, p2/M, z25.s, z10.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z10.s\n"
- "smin z27.s, p2/M, z27.s, z10.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z28.s, p2/M, z28.s, z10.s\n"
- "smin z29.s, p2/M, z29.s, z10.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
- "smin z30.s, p2/M, z30.s, z10.s\n"
- "smin z31.s, p2/M, z31.s, z10.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z25.s, p2/M, z25.s, z11.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z11.s\n"
- "smax z27.s, p2/M, z27.s, z11.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z11.s\n"
- "smax z29.s, p2/M, z29.s, z11.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z11.s\n"
- "smax z31.s, p2/M, z31.s, z11.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z13.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z13.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z13.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
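
Before entering the loop, the kernel above precomputes the weight-zero-point cross term: a ones vector (mov z13.b, #0x1 then lsr z13.s, #0x8 gives 0x00010101 per lane, so each udot sums only the three valid taps of a 4-byte group) accumulates per-output input sums, which are multiplied by the negated Requantize32 b_offset and added to the bias. A scalar sketch of what each accumulator holds when the loop starts; the function name is illustrative, and the a_offset terms are assumed to be folded into the bias at weight-packing time (this asm only references b_offset):

#include <cstdint>

// sum_k (w_k - b_offset) * x_k == sum_k w_k * x_k - b_offset * sum_k x_k,
// so pre-subtracting b_offset * sum(x) lets the main loop issue plain udots.
static int32_t init_accumulator(const uint8_t *x, int taps,
                                int32_t bias, int32_t b_offset)
{
    int32_t sum = 0;
    for (int k = 0; k < taps; k++)  // what the ones-vector udots compute
        sum += x[k];
    return bias - b_offset * sum;   // b_offset is negated up front (neg z11.s)
}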
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index e7173de65a..55b6edea2c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
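
The generic.cpp kernels in this diff share one loop skeleton: a whilelt-governed predicate walks the channels and b.any loops while any lane remains active. The same pattern, written with ACLE SVE intrinsics rather than raw asm; the function and buffer names are illustrative, not part of the library:

#include <arm_sve.h>
#include <cstdint>

static void store_channels(const int32_t *src, int8_t *dst, uint64_t n_channels)
{
    // i += svcntw() plays the role of "incw x28"; the predicate goes
    // partially false on the final iteration, exactly like "whilelt".
    for (uint64_t i = 0; i < n_channels; i += svcntw())
    {
        svbool_t pg = svwhilelt_b32_u64(i, n_channels);  // whilelt p1.s, x28, n
        svint32_t v = svld1_s32(pg, src + i);            // predicated load
        svst1b_s32(pg, dst + i, v);                      // st1b { z.s }, p1 (truncating)
    }
}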
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index debaa8c296..4b65a67309 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -47,8 +47,8 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ldr x21, [%x[inptrs], #0x20]\n"
"ldr x20, [%x[inptrs], #0x10]\n"
"ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z20.d, z3.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
"mov z18.d, z4.d\n"
@@ -59,132 +59,132 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z3.d, z3.d, z23.d\n"
"zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z20.d, z1.d\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z13.d, z5.d\n"
- "mov z19.d, z6.d\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z25.d, z7.d\n"
+ "mov z8.d, z7.d\n"
"zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
"mov z4.q, z4.q[0]\n"
"ptrue p2.b\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
"neg z23.s, p2/M, z23.s\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- "mov z30.b, #0x1\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
"mov x9, #0x0\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
"mov z24.s, #0x0\n"
- "mov z28.s, #0x0\n"
- "udot z24.s, z30.b, z3.b[0]\n"
- "ld1w { z12.s }, p1/Z, [%x[params]]\n"
- "mov z18.s, #0x0\n"
+ "udot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
"mov z17.s, #0x0\n"
- "udot z28.s, z30.b, z3.b[2]\n"
+ "mov z16.s, #0x0\n"
+ "udot z24.s, z28.b, z3.b[2]\n"
"mov x28, #0x0\n"
- "mov z16.d, z0.d\n"
- "udot z18.s, z30.b, z4.b[0]\n"
- "udot z17.s, z30.b, z4.b[2]\n"
+ "mov z27.d, z0.d\n"
+ "udot z17.s, z28.b, z4.b[0]\n"
+ "udot z16.s, z28.b, z4.b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "zip1 z1.d, z1.d, z20.d\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z13.d\n"
+ "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z19.d\n"
- "zip1 z7.d, z7.d, z25.d\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z26.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "udot z26.s, z30.b, z2.b[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "udot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
"mov z29.s, #0x1\n"
- "udot z22.s, z30.b, z2.b[2]\n"
- "udot z24.s, z29.b, z3.b[1]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z16.d\n"
+ "udot z31.s, z28.b, z2.b[2]\n"
+ "udot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
"mov z1.q, z1.q[0]\n"
- "udot z28.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "udot z18.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "udot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "mov z21.s, #0x0\n"
- "udot z17.s, z29.b, z4.b[3]\n"
+ "mov z22.s, #0x0\n"
+ "udot z16.s, z29.b, z4.b[3]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z20.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "udot z21.s, z30.b, z1.b[0]\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z22.s, z28.b, z1.b[0]\n"
"mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
"mov z19.s, #0x0\n"
- "udot z20.s, z30.b, z1.b[2]\n"
- "udot z25.s, z30.b, z5.b[0]\n"
- "udot z27.s, z30.b, z5.b[2]\n"
- "mov z0.q, z0.q[0]\n"
- "udot z19.s, z30.b, z6.b[0]\n"
- "udot z26.s, z29.b, z2.b[1]\n"
- "add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "udot z18.s, z30.b, z6.b[2]\n"
- "udot z22.s, z29.b, z2.b[3]\n"
- "add z17.s, z28.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z7.b[0]\n"
- "udot z21.s, z29.b, z1.b[1]\n"
- "udot z20.s, z29.b, z1.b[3]\n"
- "add z28.s, z26.s, z24.s\n"
- "udot z25.s, z29.b, z5.b[1]\n"
+ "udot z26.s, z28.b, z5.b[0]\n"
+ "udot z27.s, z28.b, z5.b[2]\n"
+ "udot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z28.b, z6.b[2]\n"
+ "udot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "udot z25.s, z28.b, z7.b[2]\n"
+ "udot z30.s, z29.b, z2.b[1]\n"
+ "udot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z28.b, z0.b[0]\n"
+ "udot z21.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z5.b[1]\n"
"udot z27.s, z29.b, z5.b[3]\n"
- "add z31.s, z22.s, z17.s\n"
- "udot z19.s, z29.b, z6.b[1]\n"
- "udot z18.s, z29.b, z6.b[3]\n"
- "add z22.s, z21.s, z28.s\n"
- "udot z16.s, z29.b, z7.b[1]\n"
- "add z21.s, z20.s, z31.s\n"
- "add z20.s, z25.s, z19.s\n"
- "add z19.s, z27.s, z18.s\n"
- "add z18.s, z16.s, z24.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z7.b[2]\n"
- "udot z16.s, z29.b, z7.b[3]\n"
- "add z17.s, z16.s, z17.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z0.b[0]\n"
- "udot z16.s, z29.b, z0.b[1]\n"
- "add z24.s, z22.s, z16.s\n"
- "add z26.s, z22.s, z25.s\n"
+ "add z30.s, z30.s, z17.s\n"
+ "udot z20.s, z29.b, z6.b[1]\n"
+ "udot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "udot z18.s, z29.b, z7.b[1]\n"
+ "udot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z28.b, z0.b[2]\n"
+ "udot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
"mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mov z16.s, #0x0\n"
- "udot z16.s, z30.b, z0.b[2]\n"
- "udot z16.s, z29.b, z0.b[3]\n"
- "add z25.s, z21.s, z16.s\n"
- "add z27.s, z21.s, z27.s\n"
"mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
"mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z28.s\n"
+ "add z28.s, z20.s, z30.s\n"
"add z29.s, z19.s, z31.s\n"
"mul z28.s, p2/M, z28.s, z23.s\n"
"mul z29.s, p2/M, z29.s, z23.s\n"
- "add z30.s, z18.s, z20.s\n"
- "add z31.s, z17.s, z19.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
"mul z30.s, p2/M, z30.s, z23.s\n"
"mul z31.s, p2/M, z31.s, z23.s\n"
"zip1 z19.s, z24.s, z26.s\n"
@@ -204,22 +204,22 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"1:" // Loop
"udot z24.s, z8.b, z0.b[0]\n"
"udot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"udot z26.s, z8.b, z1.b[0]\n"
"udot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
"udot z24.s, z9.b, z0.b[1]\n"
"udot z25.s, z9.b, z0.b[3]\n"
- "whilelt p1.b, x9, x10\n"
+ "whilelt p0.b, x9, x10\n"
"udot z26.s, z9.b, z1.b[1]\n"
"udot z27.s, z9.b, z1.b[3]\n"
"udot z28.s, z8.b, z2.b[0]\n"
"udot z29.s, z8.b, z2.b[2]\n"
"udot z30.s, z8.b, z3.b[0]\n"
"udot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
"udot z24.s, z10.b, z1.b[0]\n"
"udot z25.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[0]\n"
@@ -228,7 +228,7 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"udot z29.s, z9.b, z2.b[3]\n"
"udot z30.s, z9.b, z3.b[1]\n"
"udot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
"udot z24.s, z11.b, z1.b[1]\n"
"udot z25.s, z11.b, z1.b[3]\n"
"udot z26.s, z11.b, z2.b[1]\n"
@@ -237,158 +237,158 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"udot z29.s, z10.b, z3.b[2]\n"
"udot z30.s, z10.b, z4.b[0]\n"
"udot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z25.s, z8.b, z2.b[2]\n"
- "udot z26.s, z8.b, z3.b[0]\n"
- "udot z27.s, z8.b, z3.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z17.b, z2.b[0]\n"
+ "udot z25.s, z17.b, z2.b[2]\n"
+ "udot z26.s, z17.b, z3.b[0]\n"
+ "udot z27.s, z17.b, z3.b[2]\n"
"udot z28.s, z11.b, z3.b[1]\n"
"udot z29.s, z11.b, z3.b[3]\n"
"udot z30.s, z11.b, z4.b[1]\n"
"udot z31.s, z11.b, z4.b[3]\n"
- "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[1]\n"
- "udot z25.s, z9.b, z2.b[3]\n"
- "udot z26.s, z9.b, z3.b[1]\n"
- "udot z27.s, z9.b, z3.b[3]\n"
- "udot z28.s, z8.b, z4.b[0]\n"
- "udot z29.s, z8.b, z4.b[2]\n"
- "udot z30.s, z8.b, z5.b[0]\n"
- "udot z31.s, z8.b, z5.b[2]\n"
- "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "udot z24.s, z10.b, z3.b[0]\n"
- "udot z25.s, z10.b, z3.b[2]\n"
- "udot z26.s, z10.b, z4.b[0]\n"
- "udot z27.s, z10.b, z4.b[2]\n"
- "udot z28.s, z9.b, z4.b[1]\n"
- "udot z29.s, z9.b, z4.b[3]\n"
- "udot z30.s, z9.b, z5.b[1]\n"
- "udot z31.s, z9.b, z5.b[3]\n"
- "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z16.b, z2.b[1]\n"
+ "udot z25.s, z16.b, z2.b[3]\n"
+ "udot z26.s, z16.b, z3.b[1]\n"
+ "udot z27.s, z16.b, z3.b[3]\n"
+ "udot z28.s, z17.b, z4.b[0]\n"
+ "udot z29.s, z17.b, z4.b[2]\n"
+ "udot z30.s, z17.b, z5.b[0]\n"
+ "udot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z19.b, z3.b[0]\n"
+ "udot z25.s, z19.b, z3.b[2]\n"
+ "udot z26.s, z19.b, z4.b[0]\n"
+ "udot z27.s, z19.b, z4.b[2]\n"
+ "udot z28.s, z16.b, z4.b[1]\n"
+ "udot z29.s, z16.b, z4.b[3]\n"
+ "udot z30.s, z16.b, z5.b[1]\n"
+ "udot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "udot z24.s, z11.b, z3.b[1]\n"
- "udot z25.s, z11.b, z3.b[3]\n"
- "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "udot z26.s, z11.b, z4.b[1]\n"
- "udot z27.s, z11.b, z4.b[3]\n"
- "udot z28.s, z10.b, z5.b[0]\n"
- "udot z29.s, z10.b, z5.b[2]\n"
- "udot z30.s, z10.b, z6.b[0]\n"
- "udot z31.s, z10.b, z6.b[2]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "udot z24.s, z8.b, z4.b[0]\n"
- "udot z25.s, z8.b, z4.b[2]\n"
- "udot z26.s, z8.b, z5.b[0]\n"
- "udot z27.s, z8.b, z5.b[2]\n"
- "udot z28.s, z11.b, z5.b[1]\n"
- "udot z29.s, z11.b, z5.b[3]\n"
- "udot z30.s, z11.b, z6.b[1]\n"
- "udot z31.s, z11.b, z6.b[3]\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "udot z24.s, z9.b, z4.b[1]\n"
- "udot z25.s, z9.b, z4.b[3]\n"
- ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
- "udot z26.s, z9.b, z5.b[1]\n"
- "udot z27.s, z9.b, z5.b[3]\n"
- ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
- "udot z28.s, z8.b, z6.b[0]\n"
- "udot z29.s, z8.b, z6.b[2]\n"
- ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
- "udot z30.s, z8.b, z7.b[0]\n"
- "udot z31.s, z8.b, z7.b[2]\n"
- ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "udot z28.s, z9.b, z6.b[1]\n"
- "udot z29.s, z9.b, z6.b[3]\n"
- "and z16.d, z24.d, z19.d\n"
- "udot z30.s, z9.b, z7.b[1]\n"
- "udot z31.s, z9.b, z7.b[3]\n"
- "and z18.d, z25.d, z19.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "asr z16.s, z16.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ "udot z24.s, z18.b, z3.b[1]\n"
+ "udot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z18.b, z4.b[1]\n"
+ "udot z27.s, z18.b, z4.b[3]\n"
+ "udot z28.s, z19.b, z5.b[0]\n"
+ "udot z29.s, z19.b, z5.b[2]\n"
+ "udot z30.s, z19.b, z6.b[0]\n"
+ "udot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
+ "udot z28.s, z18.b, z5.b[1]\n"
+ "udot z29.s, z18.b, z5.b[3]\n"
+ "udot z30.s, z18.b, z6.b[1]\n"
+ "udot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z24.s, z16.b, z4.b[1]\n"
+ "udot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "udot z26.s, z16.b, z5.b[1]\n"
+ "udot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z28.s, z17.b, z6.b[0]\n"
+ "udot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "udot z30.s, z17.b, z7.b[0]\n"
+ "udot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z28.s, z16.b, z6.b[1]\n"
+ "udot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z16.b, z7.b[1]\n"
+ "udot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
- ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
- ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
- ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
- "and z17.d, z26.d, z19.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z24.s, z24.s, z16.s\n"
- "and z16.d, z27.d, z19.d\n"
- ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
- ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
- "and z16.d, z28.d, z19.d\n"
- "and z18.d, z29.d, z19.d\n"
- "and z17.d, z30.d, z19.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z28.s, z28.s, z16.s\n"
- "and z16.d, z31.d, z19.d\n"
- ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
- ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
"smin z24.s, p2/M, z24.s, z15.s\n"
"smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
"smin z26.s, p2/M, z26.s, z15.s\n"
"smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
"smin z28.s, p2/M, z28.s, z15.s\n"
"smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"smin z30.s, p2/M, z30.s, z15.s\n"
"smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z13.s\n"
- "smax z25.s, p2/M, z25.s, z13.s\n"
- "st1b { z24.s }, p0, [x27, x28]\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z13.s\n"
- "st1b { z25.s }, p0, [x26, x28]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z13.s\n"
- "smax z29.s, p2/M, z29.s, z13.s\n"
- "st1b { z26.s }, p0, [x25, x28]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z13.s\n"
- "smax z31.s, p2/M, z31.s, z13.s\n"
- "st1b { z27.s }, p0, [x24, x28]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
- "st1b { z28.s }, p0, [x23, x28]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z12.s\n"
- "st1b { z29.s }, p0, [x22, x28]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z12.s\n"
- "st1b { z30.s }, p0, [x21, x28]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z12.s\n"
- "st1b { z31.s }, p0, [x20, x28]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 3d475daf72..0f1030c0d7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index ff3ec0ba48..887eccf1e9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,320 +91,320 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x8, #0x0\n"
+ "mov x16, #0x0\n"
"ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
"ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x8\n"
+ "mov x23, x16\n"
"add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x15, x14, [x24, #0x0]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
"incw x23\n"
- "whilelt p3.h, x8, x17\n"
- "ldp x13, x12, [x24, #0x10]\n"
- "whilelt p2.s, x8, x17\n"
- "whilelt p1.s, x23, x17\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "mov z26.d, z13.d\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z24.d, z13.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z16.d, z17.d\n"
- "mov z25.d, z13.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z9.d, z17.d\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"1:" // Loop
- ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x22, [x11, #0x28]\n"
- "ldr x27, [x11, #0x38]\n"
- ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- "ldr x21, [x11, #0x30]\n"
- "ldr x26, [x11, #0x40]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x11, #0x48]\n"
- "ld1b { z30.h }, p3/Z, [x20, x8]\n"
- ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
- "ldr x25, [x11, #0x50]\n"
- "ldr x24, [x11, #0x58]\n"
- ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- "ldr x23, [x11, #0x60]\n"
- ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- "ldr x22, [x11, #0x68]\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
- "ldr x20, [x11, #0x78]\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z19.s, z20.s, z18.s\n"
- ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
- ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
- "uzp2 z22.s, z20.s, z18.s\n"
- "ld1w { z20.s }, p2/Z, [x28]\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x26, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "whilelt p0.h, x10, x17\n"
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
"inch x16\n"
- ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
- "addvl x9, x9, #2\n"
- ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x8]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
- "addvl x28, x28, #2\n"
- ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x23, x8]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
- "uzp1 z1.s, z20.s, z18.s\n"
- ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- "uzp2 z27.s, z20.s, z18.s\n"
- ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x8]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x8]\n"
- ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
- ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x8]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "inch x8\n"
- ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z21.d, z13.d, z1.d\n"
- "mov x20, x8\n"
- ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
"incw x20\n"
- ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
- "asr z21.s, z21.s, #0x1f\n"
- "whilelt p2.s, x8, x17\n"
- ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
- ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "and z20.d, z17.d, z27.d\n"
- "whilelt p1.s, x20, x17\n"
- ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
- ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
- "whilelt p3.h, x8, x17\n"
- ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
- "sqadd z13.s, z13.s, z21.s\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z26.d, z1.d\n"
- ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
- "and z18.d, z24.d, z1.d\n"
- ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
- "and z21.d, z25.d, z1.d\n"
- ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z10.d, z27.d\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z22.d, z16.d, z27.d\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z20.d, z9.d, z27.d\n"
- "sqadd z26.s, z26.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "asr z22.s, z22.s, #0x1f\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- "sqadd z25.s, z25.s, z21.s\n"
+ "and z7.d, z25.d, z1.d\n"
"asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- "sqadd z10.s, z10.s, z2.s\n"
- "sqadd z16.s, z16.s, z22.s\n"
- ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
- ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
- "sqadd z9.s, z9.s, z20.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
- ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
- ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
- ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
- ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
- ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z12.h\n"
- "smin z13.h, p4/M, z13.h, z11.h\n"
- "sqadd z26.h, z26.h, z14.h\n"
- "sqadd z24.h, z24.h, z14.h\n"
- "smax z26.h, p4/M, z26.h, z12.h\n"
- "smax z24.h, p4/M, z24.h, z12.h\n"
- "sqadd z25.h, z25.h, z14.h\n"
- "smax z25.h, p4/M, z25.h, z12.h\n"
- "smin z26.h, p4/M, z26.h, z11.h\n"
- "st1b { z13.h }, p0, [x15, x10]\n"
- "smin z24.h, p4/M, z24.h, z11.h\n"
- "smin z25.h, p4/M, z25.h, z11.h\n"
- "st1b { z26.h }, p0, [x14, x10]\n"
- "st1b { z24.h }, p0, [x13, x10]\n"
- "st1b { z25.h }, p0, [x12, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x16]\n"
- "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
- ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
- "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
- ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
- "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- "inch x16, ALL, MUL #8\n"
- ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
- "ld1w { z17.s }, p2/Z, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z13.s, z17.s, z16.s\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x24, x23, [x11, #0x0]\n"
- "addvl x26, x26, #2\n"
- "str x26, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x11, #0x10]\n"
- "ldr x20, [x11, #0x20]\n"
- "mov z26.d, z13.d\n"
- "mov z10.d, z17.d\n"
- "ld1b { z31.h }, p3/Z, [x24, x8]\n"
- "ld1b { z30.h }, p3/Z, [x23, x8]\n"
- "mov z24.d, z13.d\n"
- "mov z16.d, z17.d\n"
- "ld1b { z29.h }, p3/Z, [x22, x8]\n"
- "ld1b { z28.h }, p3/Z, [x21, x8]\n"
- "mov z25.d, z13.d\n"
- "mov z9.d, z17.d\n"
- "ld1b { z27.h }, p3/Z, [x20, x8]\n"
- ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
- ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
- ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
- ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
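In the u8s8u8q kernels the activations are u8 (zero point a_offset) and the weights s8 (zero point b_offset): USUBLB/SSUBLB subtract the offsets while widening to 16 bits, SMLALB/SMLALT accumulate the products into i32, and after requantisation SQXTNB/SQXTNT narrow to i16 so that c_offset and the clamp are applied in the halfword domain. A scalar sketch of one tap and of the store tail (names illustrative; C++17 for std::clamp):

    #include <algorithm>
    #include <cstdint>

    // One multiply-accumulate tap: USUBLB/SSUBLB + SMLALB/SMLALT.
    static int32_t tap(uint8_t in, int8_t w,
                       int32_t a_offset, int32_t b_offset, int32_t acc)
    {
        int32_t x = (int32_t) in - a_offset;  // usublb: widen u8, subtract a_offset
        int32_t k = (int32_t) w  - b_offset;  // ssublb: widen s8, subtract b_offset
        return acc + x * k;                   // smlal{b,t}: i16 x i16 -> i32
    }

    // Store tail: narrow with saturation (sqxtnb/sqxtnt), then add
    // c_offset and clamp to [minval, maxval] as halfwords, store a byte.
    static uint8_t store_tail(int32_t requantized, int16_t c_offset,
                              int16_t minval, int16_t maxval)
    {
        int32_t v = std::clamp(requantized, -32768, 32767);
        v = std::clamp(v + (int32_t) c_offset, (int32_t) minval, (int32_t) maxval);
        return (uint8_t) v;
    }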
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 9a3db20f73..79e3fd5f54 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 24c4bf713d..754d06d443 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -110,13 +110,13 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
"add x20, x25, %[offsetof_Requantize32_b_offset]\n"
"add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z23.b }, p4/Z, [x21]\n"
- "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
"add x21, x25, %[offsetof_Requantize32_minval]\n"
"add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z14.h }, p4/Z, [x22]\n"
- "ld1rh { z16.h }, p4/Z, [x21]\n"
- "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
"ldp x16, x15, [x24, #0x0]\n"
"incw x23\n"
"whilelt p3.h, x7, x8\n"
@@ -124,320 +124,320 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"whilelt p2.s, x7, x8\n"
"whilelt p1.s, x23, x8\n"
"ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"add x11, %x[params], %[offsetof_Params_inptrs]\n"
"mov x10, #0x0\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
"addvl x12, x12, #2\n"
- "mov z9.d, z13.d\n"
+ "mov z18.d, z8.d\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z10.d, z17.d\n"
- "mov z11.d, z13.d\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z22.d, z17.d\n"
- "mov z21.d, z13.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z18.d, z17.d\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
"str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"1:" // Loop
- ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
- "ldr x25, [x11, #0x40]\n"
- "ldr x24, [x11, #0x48]\n"
- ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
- ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
- "ldr x22, [x11, #0x50]\n"
- "ldr x20, [x11, #0x58]\n"
- ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ldr x23, [x11, #0x78]\n"
- "ldr x21, [x11, #0x60]\n"
- ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
- ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
- "ldr x22, [x11, #0x80]\n"
- "ldr x20, [x11, #0x68]\n"
- ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ldr x21, [x11, #0x88]\n"
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
"ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x11, #0x70]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
- "ldr x25, [x11, #0x98]\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- "ldr x24, [x11, #0x90]\n"
- ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
- "ldr x23, [x11, #0xa8]\n"
- "ldr x20, [x11, #0xa0]\n"
- ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
- "ldr x22, [x11, #0xb0]\n"
- "ldr x21, [x11, #0xb8]\n"
- ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z31.s }, p2/Z, [x27]\n"
- ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
- "uzp1 z19.s, z31.s, z20.s\n"
- ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
- ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
- "uzp2 z30.s, z31.s, z20.s\n"
- "ld1w { z31.s }, p2/Z, [x26]\n"
- ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x7]\n"
- ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
- ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
- "uzp1 z1.s, z31.s, z20.s\n"
- ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
- ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
"ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
- "uzp2 z31.s, z31.s, z20.s\n"
- "inch x17\n"
- ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
- "and z0.d, z13.d, z1.d\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
"inch x7\n"
- ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
- ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
"mov x20, x7\n"
- ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
- "asr z0.s, z0.s, #0x1f\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
"incw x20\n"
- ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
- "and z20.d, z17.d, z31.d\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
"whilelt p2.s, x7, x8\n"
- ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
"whilelt p1.s, x20, x8\n"
- ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
"whilelt p3.h, x7, x8\n"
- "sqadd z13.s, z13.s, z0.s\n"
- "asr z20.s, z20.s, #0x1f\n"
- ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
- "addvl x27, x27, #2\n"
- "and z19.d, z9.d, z1.d\n"
- ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
- "addvl x26, x26, #2\n"
- "and z2.d, z11.d, z1.d\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "and z0.d, z21.d, z1.d\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- "sqadd z17.s, z17.s, z20.s\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
- "and z3.d, z10.d, z31.d\n"
- "asr z2.s, z2.s, #0x1f\n"
- "and z26.d, z22.d, z31.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z20.d, z18.d, z31.d\n"
- "sqadd z9.s, z9.s, z19.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z11.s, z11.s, z2.s\n"
- ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z21.s, z21.s, z0.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
"asr z20.s, z20.s, #0x1f\n"
- "sqadd z10.s, z10.s, z3.s\n"
- ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
- "sqadd z22.s, z22.s, z26.s\n"
- "sqadd z18.s, z18.s, z20.s\n"
- ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
- ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
- ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
- ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
- ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
- ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
- ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
- ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
- ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
- "sqadd z13.h, z13.h, z14.h\n"
- "sqadd z9.h, z9.h, z14.h\n"
- "smax z13.h, p4/M, z13.h, z16.h\n"
- "smax z9.h, p4/M, z9.h, z16.h\n"
- "sqadd z11.h, z11.h, z14.h\n"
- "sqadd z21.h, z21.h, z14.h\n"
- "smax z11.h, p4/M, z11.h, z16.h\n"
- "smax z21.h, p4/M, z21.h, z16.h\n"
- "smin z13.h, p4/M, z13.h, z15.h\n"
- "smin z9.h, p4/M, z9.h, z15.h\n"
- "st1b { z13.h }, p0, [x16, x10]\n"
- "smin z11.h, p4/M, z11.h, z15.h\n"
- "smin z21.h, p4/M, z21.h, z15.h\n"
- "st1b { z9.h }, p0, [x15, x10]\n"
- "st1b { z11.h }, p0, [x14, x10]\n"
- "st1b { z21.h }, p0, [x13, x10]\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
"inch x10\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
"inch x17, ALL, MUL #8\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ld1w { z18.s }, p2/Z, [x12]\n"
- "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z8.s\n"
- "uzp2 z17.s, z18.s, z8.s\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- "ldp x9, x28, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
"ldp x25, x24, [x11, #0x10]\n"
"ldp x23, x22, [x11, #0x20]\n"
- "mov z9.d, z13.d\n"
- "mov z10.d, z17.d\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
"ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x9, x7]\n"
- "mov z11.d, z13.d\n"
- "mov z22.d, z17.d\n"
- "ld1b { z30.h }, p3/Z, [x28, x7]\n"
- "ld1b { z29.h }, p3/Z, [x25, x7]\n"
- "mov z21.d, z13.d\n"
- "mov z18.d, z17.d\n"
- "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
- ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ld1b { z26.h }, p3/Z, [x22, x7]\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
- ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ld1b { z24.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
- ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
- ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
- ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
- ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
- ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
- ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
- ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
@@ -448,4 +448,4 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
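
Two changes recur across the kernels in this commit, both visible in the hunks above: the weights pointer in each Params struct and impl signature is type-erased from `const int8_t *` to `const void *`, so a single parameter layout can serve kernels whose weight element types differ, and the preprocessor guard drops the explicit `__aarch64__` test, presumably because `ARM_COMPUTE_ENABLE_SVE` is only defined for AArch64 builds anyway. A minimal sketch of the resulting pattern (illustrative names only, not the library's actual declarations):

#if defined(ARM_COMPUTE_ENABLE_SVE)

#include <cstdint>

struct Params
{
  unsigned long n_channels;
  const void *weights;   // was `const int8_t *`; element type is kernel-specific
  const int32_t *bias;
};

void kernel_impl(const Params &params)
{
  // Each kernel knows its own weight format, so it casts the
  // type-erased pointer back to the concrete element type before
  // handing it to the inline assembly.
  const auto *weights = static_cast<const int8_t *>(params.weights);
  (void)weights;
}

#endif // defined(ARM_COMPUTE_ENABLE_SVE)
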
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 06ca42eed9..0ff853ec2d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
#include <cstdint>
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -47,17 +47,16 @@ class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfi
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
- unsigned int get_accumulator_depth_vl(void) const override { return 2; }
-
sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
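
The assembly in these u8s8u8q kernels follows one requantization recipe throughout: widen the uint8 inputs and int8 weights while subtracting their zero points (`usublb`/`ssublb` against the broadcast `a_offset`/`b_offset` vectors), accumulate products into 32 bits (`smlalb`/`smlalt`), scale each accumulator with a saturating rounding doubling multiply-high (`sqrdmulh`) against the per-channel multiplier from `requant_muls`, apply a rounding shift from `requant_shifts` (`srshl`, with the `and`/`asr #0x1f`/`sqadd` triplet acting as a rounding correction beforehand), narrow with saturation (`sqxtnb`/`sqxtnt`), add `c_offset`, and clamp to `[minval, maxval]` (`sqadd`/`smax`/`smin`). A scalar sketch of that per-lane arithmetic, for illustration only — the vector code operates on whole SVE registers and folds the rounding correction differently:

#include <algorithm>
#include <cstdint>

// Saturating rounding doubling multiply returning the high 32 bits,
// mirroring what SQRDMULH computes per lane.
inline int32_t sqrdmulh(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // saturating case
  int64_t p = (int64_t)a * b;
  return (int32_t)((2 * p + (1LL << 31)) >> 32);
}

// Rounding right shift: the effect of SRSHL with a negative shift amount.
inline int32_t rounding_rshift(int32_t x, int shift)
{
  if (shift <= 0) return x;
  return (int32_t)(((int64_t)x + (1LL << (shift - 1))) >> shift);
}

inline uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
  int32_t v = rounding_rshift(sqrdmulh(acc, mul), shift);
  v = std::min(std::max(v + c_offset, minval), maxval);
  return (uint8_t)v;  // the vector code stores one byte per lane (st1b)
}
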
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 9c291ae186..f24a258484 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -46,7 +46,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,542 +111,542 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x0, #0x0\n"
- "mov x24, x0\n"
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"incw x24\n"
- "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
"add x21, x23, %[offsetof_Requantize32_a_offset]\n"
"add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z15.b }, p4/Z, [x21]\n"
- "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
- "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z11.h }, p4/Z, [x20]\n"
- "ldp x3, x4, [x22, #0x0]\n"
- "whilelt p3.h, x0, x1\n"
- "ldp x5, x6, [x22, #0x10]\n"
- "whilelt p2.s, x0, x1\n"
- "whilelt p1.s, x24, x1\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "mov x8, #0x0\n"
- "mov z20.d, z14.d\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z7.d, z10.d\n"
- "mov z8.d, z14.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z16.d, z10.d\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
"mov z6.d, z14.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
- "mov z5.d, z10.d\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"1:" // Loop
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x7, #0x50]\n"
- "ld1b { z31.h }, p3/Z, [x20, x0]\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x7, #0x58]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x21, [x7, #0x60]\n"
- "ldr x20, [x7, #0x68]\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
- "ldr x25, [x7, #0x70]\n"
- "ldr x24, [x7, #0x78]\n"
- ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x15, [x7, #0x80]\n"
- "ldr x23, [x7, #0x88]\n"
- ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
- "ld1b { z27.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- "ldr x22, [x7, #0x90]\n"
- "ldr x21, [x7, #0x98]\n"
- ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- "ldr x14, [x7, #0xa0]\n"
- "ldr x13, [x7, #0xa8]\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- "ldr x12, [x7, #0xb0]\n"
- "ldr x20, [x7, #0xb8]\n"
- ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- "ldr x11, [x7, #0xc0]\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
- "ldr x10, [x7, #0xc8]\n"
- "ldr x9, [x7, #0xd0]\n"
- ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- "ldr x28, [x7, #0xd8]\n"
- "ldr x27, [x7, #0xe0]\n"
- ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
- ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
- "ldr x26, [x7, #0xe8]\n"
- "ldr x25, [x7, #0xf0]\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- "ld1w { z19.s }, p2/Z, [x17]\n"
- "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
- "ldr x24, [x7, #0xf8]\n"
- "uzp1 z9.s, z19.s, z18.s\n"
- ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- "uzp2 z29.s, z19.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16]\n"
- ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
- "ldr x23, [x7, #0x100]\n"
- "whilelt p0.h, x8, x1\n"
- ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- "addvl x17, x17, #2\n"
- ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
- "ldr x22, [x7, #0x108]\n"
- ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
- "ldr x21, [x7, #0x110]\n"
- ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
- ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- "ldr x20, [x7, #0x118]\n"
- "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
- ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
- ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x11, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
- "ld1b { z23.h }, p3/Z, [x10, x0]\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x0]\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
- ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
- "ld1b { z24.h }, p3/Z, [x24, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
- ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
- "inch x2, ALL, MUL #8\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
- "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
- ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
- ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z4.h }, p4/Z, [x2]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- "inch x2\n"
- ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
- "uzp1 z23.s, z19.s, z18.s\n"
- ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
- "uzp2 z22.s, z19.s, z18.s\n"
- ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x21, x0]\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
- ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
- "inch x0\n"
- ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
- "and z21.d, z14.d, z23.d\n"
- "mov x20, x0\n"
- ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
"incw x20\n"
- ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
- ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
- "whilelt p2.s, x0, x1\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
- "and z3.d, z10.d, z22.d\n"
- "whilelt p1.s, x20, x1\n"
- ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
- ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
- "whilelt p3.h, x0, x1\n"
- ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
- "sqadd z14.s, z14.s, z21.s\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
"asr z3.s, z3.s, #0x1f\n"
- ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
- "and z19.d, z20.d, z23.d\n"
- ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
- "and z18.d, z8.d, z23.d\n"
- ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
- "and z21.d, z6.d, z23.d\n"
- ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
- "sqadd z10.s, z10.s, z3.s\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
- "and z1.d, z7.d, z22.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z2.d, z16.d, z22.d\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
"asr z21.s, z21.s, #0x1f\n"
- "and z3.d, z5.d, z22.d\n"
- "sqadd z20.s, z20.s, z19.s\n"
- ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
- "asr z1.s, z1.s, #0x1f\n"
- "sqadd z8.s, z8.s, z18.s\n"
- ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "sqadd z6.s, z6.s, z21.s\n"
- ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "sqadd z7.s, z7.s, z1.s\n"
- ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
- "sqadd z16.s, z16.s, z2.s\n"
- "sqadd z5.s, z5.s, z3.s\n"
- ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
- ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
- ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
- ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
- ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
- "sqadd z14.h, z14.h, z12.h\n"
- "sqadd z20.h, z20.h, z12.h\n"
- "smax z14.h, p4/M, z14.h, z13.h\n"
- "smax z20.h, p4/M, z20.h, z13.h\n"
- "sqadd z8.h, z8.h, z12.h\n"
- "sqadd z6.h, z6.h, z12.h\n"
- "smax z8.h, p4/M, z8.h, z13.h\n"
- "smax z6.h, p4/M, z6.h, z13.h\n"
- "smin z14.h, p4/M, z14.h, z11.h\n"
- "smin z20.h, p4/M, z20.h, z11.h\n"
- "st1b { z14.h }, p0, [x3, x8]\n"
- "smin z8.h, p4/M, z8.h, z11.h\n"
- "smin z6.h, p4/M, z6.h, z11.h\n"
- "st1b { z20.h }, p0, [x4, x8]\n"
- "st1b { z8.h }, p0, [x5, x8]\n"
- "st1b { z6.h }, p0, [x6, x8]\n"
- "ld1w { z30.s }, p2/Z, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z14.s, z30.s, z16.s\n"
- "ld1sb { z0.h }, p4/Z, [x2]\n"
- "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
- "uzp2 z10.s, z30.s, z16.s\n"
- "addvl x14, x14, #2\n"
- "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
- "inch x8\n"
- "str x14, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
- "ldp x9, x28, [x7, #0x0]\n"
- "mov z20.d, z14.d\n"
- "mov z7.d, z10.d\n"
- "ldp x27, x26, [x7, #0x10]\n"
- "ldp x25, x24, [x7, #0x20]\n"
- "mov z8.d, z14.d\n"
- "mov z16.d, z10.d\n"
- "ldp x23, x22, [x7, #0x30]\n"
- "ldp x21, x20, [x7, #0x40]\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
"mov z6.d, z14.d\n"
- "mov z5.d, z10.d\n"
- "ld1b { z31.h }, p3/Z, [x9, x0]\n"
- "ld1b { z30.h }, p3/Z, [x28, x0]\n"
- ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
- ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
- "ld1b { z29.h }, p3/Z, [x27, x0]\n"
- "ld1b { z28.h }, p3/Z, [x26, x0]\n"
- ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
- ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
- "ld1b { z27.h }, p3/Z, [x25, x0]\n"
- "ld1b { z23.h }, p3/Z, [x24, x0]\n"
- ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
- ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
- "ld1b { z25.h }, p3/Z, [x23, x0]\n"
- "ld1b { z24.h }, p3/Z, [x22, x0]\n"
- ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
- ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x0]\n"
- "ld1b { z22.h }, p3/Z, [x20, x0]\n"
- ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
- ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
- ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
- ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
- ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
- ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
- ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
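The tail of the hunk above is the standard fixed-point requantisation sequence for the quantised depthwise kernels: SQRDMULH keeps the high half of a rounding doubling multiply by the per-channel multiplier, the AND/ASR/SQADD trio applies the sign fix-up needed before SRSHL performs a rounding right shift, SQXTNB/SQXTNT narrow the 32-bit lanes to 16, and the final SQADD/SMAX/SMIN add the output offset and clamp before ST1B stores 8-bit results. A minimal scalar sketch of that arithmetic follows; the function and parameter names are my own illustration, not the library's API, and the 32-bit saturation corner cases are omitted.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of one requantised accumulator (hypothetical names).
    int32_t requantize(int32_t acc, int32_t multiplier, int32_t shift,  // shift <= 0
                       int32_t c_offset, int32_t minval, int32_t maxval)
    {
      // SQRDMULH: high half of 2*acc*multiplier, with rounding.
      int64_t prod = static_cast<int64_t>(acc) * multiplier;
      int32_t high = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
      // SRSHL by a negative amount: arithmetic right shift, round to nearest.
      int32_t shifted = (shift < 0) ? ((high + (1 << (-shift - 1))) >> -shift) : high;
      // Output offset plus clamp; the ST1B store then narrows to 8 bits.
      return std::min(std::max(shifted + c_offset, minval), maxval);
    }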
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
index e9b29ca877..b1fe66cea2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,7 +91,7 @@
#include "depthwise.hpp"
#include "depthfirst_driver.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace depthwise {
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
index 8473fc0838..b0aa62bbcb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#pragma once
#include "pooling.hpp"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
namespace arm_conv {
namespace pooling {
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index a670bb81bb..6b3ebe6664 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -48,4 +48,4 @@ struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 647103d3a4..5df848d1dd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -196,38 +196,38 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr h6, [x11, x4]\n"
- "ldr h5, [x10, x4]\n"
- "fadd v17.8h, v6.8h, v5.8h\n"
+ "ldr h17, [x11, x4]\n"
+ "ldr h16, [x10, x4]\n"
+ "fadd v18.8h, v17.8h, v16.8h\n"
"subs x3, x3, #0x1\n"
- "ldr h4, [x27, x4]\n"
- "ldr h3, [x26, x4]\n"
- "fadd v16.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v17.8h, v16.8h\n"
- "ldr h2, [x15, x4]\n"
- "ldr h1, [x14, x4]\n"
- "fadd v18.8h, v2.8h, v1.8h\n"
- "fadd v21.8h, v18.8h, v19.8h\n"
- "ldr h0, [x12, x4]\n"
- "ldr h31, [x28, x4]\n"
- "fadd v17.8h, v0.8h, v31.8h\n"
- "ldr h30, [x9, x4]\n"
- "ldr h29, [x25, x4]\n"
- "fadd v22.8h, v30.8h, v29.8h\n"
- "ldr h28, [x23, x4]\n"
- "ldr h27, [x22, x4]\n"
- "fadd v16.8h, v28.8h, v27.8h\n"
- "fadd v20.8h, v16.8h, v19.8h\n"
- "ldr h26, [x16, x4]\n"
- "ldr h25, [x13, x4]\n"
- "fadd v19.8h, v26.8h, v17.8h\n"
- "fadd v18.8h, v25.8h, v22.8h\n"
- "ldr h24, [x24, x4]\n"
- "ldr h23, [x21, x4]\n"
- "fadd v17.8h, v24.8h, v17.8h\n"
- "fadd v16.8h, v23.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
+ "ldr h17, [x27, x4]\n"
+ "ldr h16, [x26, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v18.8h, v16.8h\n"
+ "ldr h17, [x15, x4]\n"
+ "ldr h16, [x14, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v23.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x12, x4]\n"
+ "ldr h16, [x28, x4]\n"
+ "fadd v22.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x9, x4]\n"
+ "ldr h16, [x25, x4]\n"
+ "fadd v21.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23, x4]\n"
+ "ldr h16, [x22, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v20.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x16, x4]\n"
+ "ldr h16, [x13, x4]\n"
+ "fadd v19.8h, v17.8h, v22.8h\n"
+ "fadd v18.8h, v16.8h, v21.8h\n"
+ "ldr h17, [x24, x4]\n"
+ "ldr h16, [x21, x4]\n"
+ "fadd v17.8h, v17.8h, v22.8h\n"
+ "fadd v16.8h, v16.8h, v21.8h\n"
+ "fadd v19.8h, v23.8h, v19.8h\n"
+ "fadd v18.8h, v23.8h, v18.8h\n"
"add x4, x4, #0x2\n"
"fadd v17.8h, v17.8h, v20.8h\n"
"fadd v16.8h, v16.8h, v20.8h\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 44adb4ffcf..f7be92e53f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
-
#include <cstdint>
#include <cstddef>
@@ -45,77 +44,77 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v9.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd v23.8h, v4.8h, v3.8h\n"
"fadd v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fadd v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fadd v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fadd v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fadd v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fadd v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fadd v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fadd v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fadd v7.8h, v7.8h, v18.8h\n"
"fadd v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fadd v5.8h, v5.8h, v16.8h\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd v23.8h, v4.8h, v3.8h\n"
@@ -138,16 +137,16 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v4.8h\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fadd v7.8h, v7.8h, v2.8h\n"
- "fadd v6.8h, v6.8h, v0.8h\n"
- "ldr q30, [x24, x26]\n"
- "fadd v5.8h, v5.8h, v30.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
@@ -156,14 +155,14 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"fmul v7.8h, v7.8h, v9.8h\n"
"fmul v6.8h, v6.8h, v9.8h\n"
"fmul v5.8h, v5.8h, v9.8h\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -172,146 +171,146 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v4.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
"fmul v8.8h, v8.8h, v9.8h\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
"subs x25, x25, #0x1\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
"fadd v8.8h, v8.8h, v4.8h\n"
@@ -342,7 +341,7 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
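The generic average kernel reduces an arbitrary list of input cells: v8..v5 accumulate four vectors of channels across every valid pointer, and the single fmul by v9 (broadcast from rescale_ptr at the top of the function) applies the reciprocal of the window size once at the end. A scalar sketch with assumed names follows; the real kernel accumulates in fp16 lanes, 32 channels per outer iteration, while float is used here for portability.

    #include <cstddef>

    // Per-channel sum over all valid cells, scaled once by the precomputed
    // reciprocal (the fmul by v9 above).  Names are illustrative only.
    void avg_generic(const float *const *inptrs, size_t n_valid_cells,
                     size_t n_channels, float rescale, float *outptr)
    {
      for (size_t c = 0; c < n_channels; ++c) {
        float acc = 0.0f;
        for (size_t i = 0; i < n_valid_cells; ++i)
          acc += inptrs[i][c];
        outptr[c] = acc * rescale;
      }
    }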
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 23a9164b76..b65ac7e9fa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -48,4 +48,4 @@ struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 8041453cb1..4b073b9076 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
+
#include <cstddef>
#include <cstdint>
@@ -111,7 +112,7 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"fmax v18.8h, v18.8h, v21.8h\n"
"fmax v17.8h, v17.8h, v20.8h\n"
"add x15, x15, #0x10\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v16.8h\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
+ "fmax v16.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
"fmax v17.8h, v27.8h, v23.8h\n"
- "fmax v16.8h, v24.8h, v22.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
+ "fmax v19.8h, v24.8h, v22.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
"fmax v18.8h, v18.8h, v21.8h\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"fmax v17.8h, v17.8h, v20.8h\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v19.8h\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr h30, [x28, x15]\n"
- "ldr h29, [x25, x15]\n"
- "fmax v21.8h, v30.8h, v29.8h\n"
+ "ldr h16, [x28, x15]\n"
+ "ldr h17, [x25, x15]\n"
+ "fmax v23.8h, v16.8h, v17.8h\n"
"subs x16, x16, #0x1\n"
- "ldr h28, [x22, x15]\n"
- "ldr h27, [x26, x15]\n"
- "fmax v20.8h, v29.8h, v28.8h\n"
- "ldr h26, [x9, x15]\n"
- "ldr h25, [x27, x15]\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "ldr h24, [x24, x15]\n"
- "ldr h23, [x23, x15]\n"
- "fmax v18.8h, v25.8h, v24.8h\n"
- "fmax v17.8h, v27.8h, v23.8h\n"
- "ldr h22, [x21, x15]\n"
- "fmax v16.8h, v24.8h, v22.8h\n"
+ "ldr h16, [x22, x15]\n"
+ "ldr h22, [x26, x15]\n"
+ "fmax v21.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x9, x15]\n"
+ "ldr h17, [x27, x15]\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v20.8h, v23.8h, v16.8h\n"
+ "ldr h19, [x24, x15]\n"
+ "ldr h16, [x23, x15]\n"
+ "fmax v18.8h, v17.8h, v19.8h\n"
+ "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h16, [x21, x15]\n"
+ "fmax v16.8h, v19.8h, v16.8h\n"
"add x15, x15, #0x2\n"
- "fmax v18.8h, v18.8h, v21.8h\n"
- "fmax v17.8h, v17.8h, v20.8h\n"
- "fmax v16.8h, v16.8h, v20.8h\n"
- "str h19, [x14, x12]\n"
+ "fmax v18.8h, v18.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v21.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "str h20, [x14, x12]\n"
"str h18, [x13, x12]\n"
"str h17, [x11, x12]\n"
"str h16, [x10, x12]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index e4de9fb79c..c92e2cdebd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -41,10 +41,10 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x20\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
@@ -53,66 +53,66 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"dup v7.8h, w20\n"
"dup v6.8h, w20\n"
"dup v5.8h, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fmax v23.8h, v4.8h, v3.8h\n"
"fmax v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fmax v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fmax v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fmax v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fmax v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fmax v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fmax v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fmax v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fmax v7.8h, v7.8h, v18.8h\n"
"fmax v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fmax v5.8h, v5.8h, v16.8h\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fmax v23.8h, v4.8h, v3.8h\n"
@@ -135,28 +135,28 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v4.8h\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fmax v7.8h, v7.8h, v2.8h\n"
- "fmax v6.8h, v6.8h, v0.8h\n"
- "ldr q30, [x24, x26]\n"
- "fmax v5.8h, v5.8h, v30.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
"cmp %x[n_channels], #0x20\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -166,146 +166,146 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.8h, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v4.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.8h, w20\n"
- "add %x[outptr], %x[outptr], x9\n"
- "mov x20, %x[inptrs]\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
"subs x25, x25, #0x1\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
"fmax v8.8h, v8.8h, v4.8h\n"
@@ -335,7 +335,7 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
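The max counterpart is seeded with fp16 negative infinity: 0xfc00 is the IEEE half-precision bit pattern for -inf, which "mov w20, #0xfc00; dup v8.8h, w20" splats into every accumulator lane before each valid cell is folded in with fmax. A scalar sketch under the same assumptions as the average example (float instead of fp16, illustrative names):

    #include <cmath>
    #include <cstddef>

    // Per-channel maximum over all valid cells, starting from -infinity.
    void max_generic(const float *const *inptrs, size_t n_valid_cells,
                     size_t n_channels, float *outptr)
    {
      for (size_t c = 0; c < n_channels; ++c) {
        float m = -INFINITY;                 // fp32 analogue of the 0xfc00 seed
        for (size_t i = 0; i < n_valid_cells; ++i)
          m = std::fmax(m, inptrs[i][c]);
        outptr[c] = m;
      }
    }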
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 813e685606..7add5feb1d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
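
Each AArch64-only strategy header in this patch gains the same pair of guard lines, so that non-AArch64 builds which happen to include the header compile nothing from it. The net shape of the file (an illustrative skeleton, not the actual class body):

#pragma once

#if defined(__aarch64__)

namespace arm_conv {
namespace pooling {
// ... strategy declaration ...
}  // namespace pooling
}  // namespace arm_conv

#endif  // defined(__aarch64__)
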
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 9db65d62b0..cf0047638e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -22,12 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <algorithm>
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -196,38 +196,38 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr s6, [x11, x4]\n"
- "ldr s5, [x10, x4]\n"
- "fadd v17.4s, v6.4s, v5.4s\n"
+ "ldr s17, [x11, x4]\n"
+ "ldr s16, [x10, x4]\n"
+ "fadd v18.4s, v17.4s, v16.4s\n"
"subs x3, x3, #0x1\n"
- "ldr s4, [x27, x4]\n"
- "ldr s3, [x26, x4]\n"
- "fadd v16.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v17.4s, v16.4s\n"
- "ldr s2, [x15, x4]\n"
- "ldr s1, [x14, x4]\n"
- "fadd v18.4s, v2.4s, v1.4s\n"
- "fadd v21.4s, v18.4s, v19.4s\n"
- "ldr s0, [x12, x4]\n"
- "ldr s31, [x28, x4]\n"
- "fadd v17.4s, v0.4s, v31.4s\n"
- "ldr s30, [x9, x4]\n"
- "ldr s29, [x25, x4]\n"
- "fadd v22.4s, v30.4s, v29.4s\n"
- "ldr s28, [x23, x4]\n"
- "ldr s27, [x22, x4]\n"
- "fadd v16.4s, v28.4s, v27.4s\n"
- "fadd v20.4s, v16.4s, v19.4s\n"
- "ldr s26, [x16, x4]\n"
- "ldr s25, [x13, x4]\n"
- "fadd v19.4s, v26.4s, v17.4s\n"
- "fadd v18.4s, v25.4s, v22.4s\n"
- "ldr s24, [x24, x4]\n"
- "ldr s23, [x21, x4]\n"
- "fadd v17.4s, v24.4s, v17.4s\n"
- "fadd v16.4s, v23.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
+ "ldr s17, [x27, x4]\n"
+ "ldr s16, [x26, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "ldr s17, [x15, x4]\n"
+ "ldr s16, [x14, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v23.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x12, x4]\n"
+ "ldr s16, [x28, x4]\n"
+ "fadd v22.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x9, x4]\n"
+ "ldr s16, [x25, x4]\n"
+ "fadd v21.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x23, x4]\n"
+ "ldr s16, [x22, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v20.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x16, x4]\n"
+ "ldr s16, [x13, x4]\n"
+ "fadd v19.4s, v17.4s, v22.4s\n"
+ "fadd v18.4s, v16.4s, v21.4s\n"
+ "ldr s17, [x24, x4]\n"
+ "ldr s16, [x21, x4]\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v23.4s, v19.4s\n"
+ "fadd v18.4s, v23.4s, v18.4s\n"
"add x4, x4, #0x4\n"
"fadd v17.4s, v17.4s, v20.4s\n"
"fadd v16.4s, v16.4s, v20.4s\n"
@@ -250,4 +250,5 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
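
The .cpp files get the complementary change: the __aarch64__ guard now sits after the standard includes rather than enclosing them, so every target still parses the same include set and only the kernel body is compiled out. The resulting layout, sketched:

// Before: the guard also excluded <algorithm>, <cstddef>, <cstdint>
// on non-AArch64 targets. After (as in the hunk above):
#include <algorithm>
#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)
namespace arm_conv { namespace pooling { /* kernel body */ } }
#endif  // defined(__aarch64__)
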
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 3f90610591..d236f07b1c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -44,77 +44,77 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
__asm__ __volatile__(
"ld1r { v9.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd v23.4s, v4.4s, v3.4s\n"
"fadd v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fadd v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fadd v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fadd v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fadd v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fadd v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fadd v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fadd v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fadd v7.4s, v7.4s, v18.4s\n"
"fadd v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fadd v5.4s, v5.4s, v16.4s\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd v23.4s, v4.4s, v3.4s\n"
@@ -137,16 +137,16 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v4.4s\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fadd v7.4s, v7.4s, v2.4s\n"
- "fadd v6.4s, v6.4s, v0.4s\n"
- "ldr q30, [x24, x26]\n"
- "fadd v5.4s, v5.4s, v30.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
@@ -155,14 +155,14 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"fmul v7.4s, v7.4s, v9.4s\n"
"fmul v6.4s, v6.4s, v9.4s\n"
"fmul v5.4s, v5.4s, v9.4s\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -171,110 +171,110 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v4.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
"fmul v8.4s, v8.4s, v9.4s\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
"subs x25, x25, #0x1\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
"fadd v8.4s, v8.4s, v4.4s\n"
@@ -293,10 +293,11 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
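
Beyond the register renaming, the fp32 average kernel's structure is unchanged: it sums every valid cell with fadd and multiplies once per output by the broadcast rescale value from rescale_ptr (the "ld1r { v9.4s }" at the top, applied with fmul before each store). A scalar sketch with illustrative names, assuming rescale_value is the precomputed reciprocal of the window size:

#include <cstddef>

void avg_over_cells(const float *const *inptrs, size_t n_valid_cells,
                    size_t n_channels, float rescale_value, float *outptr)
{
  for (size_t c = 0; c < n_channels; c++)
  {
    float acc = 0.0f;
    for (size_t i = 0; i < n_valid_cells; i++)
    {
      acc += inptrs[i][c];
    }
    outptr[c] = acc * rescale_value;  // one multiply, never a divide per cell
  }
}
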
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 4bf5770857..2f72b59d70 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 2e7fb3c5b1..f4202de1ed 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"fmax v18.4s, v18.4s, v21.4s\n"
"fmax v17.4s, v17.4s, v20.4s\n"
"add x15, x15, #0x10\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v16.4s\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
+ "fmax v16.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
"fmax v17.4s, v27.4s, v23.4s\n"
- "fmax v16.4s, v24.4s, v22.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
+ "fmax v19.4s, v24.4s, v22.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"fmax v17.4s, v17.4s, v20.4s\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v19.4s\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr s30, [x28, x15]\n"
- "ldr s29, [x25, x15]\n"
- "fmax v21.4s, v30.4s, v29.4s\n"
+ "ldr s16, [x28, x15]\n"
+ "ldr s17, [x25, x15]\n"
+ "fmax v23.4s, v16.4s, v17.4s\n"
"subs x16, x16, #0x1\n"
- "ldr s28, [x22, x15]\n"
- "ldr s27, [x26, x15]\n"
- "fmax v20.4s, v29.4s, v28.4s\n"
- "ldr s26, [x9, x15]\n"
- "ldr s25, [x27, x15]\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "ldr s24, [x24, x15]\n"
- "ldr s23, [x23, x15]\n"
- "fmax v18.4s, v25.4s, v24.4s\n"
- "fmax v17.4s, v27.4s, v23.4s\n"
- "ldr s22, [x21, x15]\n"
- "fmax v16.4s, v24.4s, v22.4s\n"
+ "ldr s16, [x22, x15]\n"
+ "ldr s22, [x26, x15]\n"
+ "fmax v21.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x9, x15]\n"
+ "ldr s17, [x27, x15]\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v20.4s, v23.4s, v16.4s\n"
+ "ldr s19, [x24, x15]\n"
+ "ldr s16, [x23, x15]\n"
+ "fmax v18.4s, v17.4s, v19.4s\n"
+ "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s16, [x21, x15]\n"
+ "fmax v16.4s, v19.4s, v16.4s\n"
"add x15, x15, #0x4\n"
- "fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v20.4s\n"
- "fmax v16.4s, v16.4s, v20.4s\n"
- "str s19, [x14, x12]\n"
+ "fmax v18.4s, v18.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "str s20, [x14, x12]\n"
"str s18, [x13, x12]\n"
"str s17, [x11, x12]\n"
"str s16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
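
The tail rewrite in the 2x2 stride-1 max kernel is a pure register reshuffle, but it makes the kernel's sharing trick visible: pairwise maxima (v21 and v20 above) are each consumed by two of the four outputs of the 2x2 output tile. A sketch of one way to realize that sharing on a single channel; the kernel's exact pairing over its nine input pointers differs, and the names here are illustrative:

#include <algorithm>

// 2x2-window, stride-1 max pool: a 3x3 input patch yields a 2x2 output tile.
void max_2x2_s1_from_3x3(const float in[3][3], float out[2][2])
{
  // Horizontal pair maxima, one per row; each is reused by two outputs.
  float h[3][2];
  for (int r = 0; r < 3; r++)
    for (int c = 0; c < 2; c++)
      h[r][c] = std::max(in[r][c], in[r][c + 1]);

  for (int r = 0; r < 2; r++)
    for (int c = 0; c < 2; c++)
      out[r][c] = std::max(h[r][c], h[r + 1][c]);
}
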
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 4f1af09e08..f4706635dc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,10 +41,10 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x10\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
@@ -53,66 +53,66 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"dup v7.4s, w20\n"
"dup v6.4s, w20\n"
"dup v5.4s, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fmax v23.4s, v4.4s, v3.4s\n"
"fmax v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"fmax v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"fmax v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"fmax v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"fmax v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"fmax v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"fmax v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"fmax v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"fmax v7.4s, v7.4s, v18.4s\n"
"fmax v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"fmax v5.4s, v5.4s, v16.4s\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fmax v23.4s, v4.4s, v3.4s\n"
@@ -135,28 +135,28 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v4.4s\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "fmax v7.4s, v7.4s, v2.4s\n"
- "fmax v6.4s, v6.4s, v0.4s\n"
- "ldr q30, [x24, x26]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -166,110 +166,110 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.4s, w20\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v4.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
"dup v8.4s, w20\n"
- "add %x[outptr], %x[outptr], x9\n"
- "mov x20, %x[inptrs]\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
"subs x25, x25, #0x1\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
"fmax v8.4s, v8.4s, v4.4s\n"
@@ -287,10 +287,11 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
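
In the fp32 max kernels the accumulators are seeded via "mov w20, #0xff800000": that constant is the binary32 bit pattern of negative infinity, i.e. the identity element for fmax, and the renamed offset registers (x27/x26/x24/x23 holding byte offsets 0x00/0x10/0x20/0x30) then walk four q-registers of channels per pass. The reduction each lane performs, as a hedged scalar sketch:

#include <algorithm>
#include <cstddef>
#include <limits>

// Scalar equivalent of seeding with 0xff800000 (-inf) and folding in
// every valid cell with fmax; names are illustrative.
float max_over_cells_lane(const float *const *inptrs, size_t n_valid_cells,
                          size_t channel)
{
  float acc = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < n_valid_cells; i++)
    acc = std::max(acc, inptrs[i][channel]);
  return acc;
}
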
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 5a7e5f981b..5d082102b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -105,7 +105,7 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -121,42 +121,42 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "saddl v17.8h, v25.8b, v24.8b\n"
+ "saddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddl v17.8h, v25.8b, v24.8b\n"
- "saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -196,23 +196,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -330,49 +330,49 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -397,9 +397,9 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -411,142 +411,142 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -569,9 +569,9 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -626,4 +626,5 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
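
The s8 average kernel widens as it sums: saddl/saddl2 add pairs of int8 lanes into int16, saddw/saddw2 fold those into the int32 accumulators, and the store path clamps with smin/smax before narrowing back to bytes with uzp1, so no intermediate overflows. A scalar sketch of that scheme, approximating the rescale (loaded from rescale_ptr in the assembly) with a plain float multiply for clarity; the actual requantisation details are not shown in these hunks:

#include <algorithm>
#include <cstddef>
#include <cstdint>

void s8_avg_over_cells(const int8_t *const *inptrs, size_t n_valid_cells,
                       size_t n_channels, float rescale, int8_t *outptr)
{
  for (size_t c = 0; c < n_channels; c++)
  {
    int32_t acc = 0;  // the saddl/saddw chain keeps this sum exact
    for (size_t i = 0; i < n_valid_cells; i++)
      acc += inptrs[i][c];
    int32_t v = static_cast<int32_t>(acc * rescale);
    v = std::min<int32_t>(std::max<int32_t>(v, -128), 127);  // smin/smax clamp
    outptr[c] = static_cast<int8_t>(v);  // uzp1 narrowing in the vector code
  }
}
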
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 234b4442c8..f8f1134866 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index bd14408c74..7e62ac1afc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"smax v18.16b, v18.16b, v21.16b\n"
"smax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
- "smax v16.16b, v16.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v16.16b\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v19.16b, v27.16b, v26.16b\n"
+ "smax v16.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
"smax v17.16b, v27.16b, v23.16b\n"
- "smax v16.16b, v24.16b, v22.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
+ "smax v19.16b, v24.16b, v22.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
"smax v18.16b, v18.16b, v21.16b\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"smax v17.16b, v17.16b, v20.16b\n"
- "smax v16.16b, v16.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v19.16b\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x28, x15]\n"
- "ldr b29, [x25, x15]\n"
- "smax v21.16b, v30.16b, v29.16b\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "smax v23.16b, v16.16b, v17.16b\n"
"subs x16, x16, #0x1\n"
- "ldr b28, [x22, x15]\n"
- "ldr b27, [x26, x15]\n"
- "smax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x9, x15]\n"
- "ldr b25, [x27, x15]\n"
- "smax v19.16b, v27.16b, v26.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "ldr b24, [x24, x15]\n"
- "ldr b23, [x23, x15]\n"
- "smax v18.16b, v25.16b, v24.16b\n"
- "smax v17.16b, v27.16b, v23.16b\n"
- "ldr b22, [x21, x15]\n"
- "smax v16.16b, v24.16b, v22.16b\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "smax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "smax v18.16b, v17.16b, v19.16b\n"
+ "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "smax v16.16b, v19.16b, v16.16b\n"
"add x15, x15, #0x1\n"
- "smax v18.16b, v18.16b, v21.16b\n"
- "smax v17.16b, v17.16b, v20.16b\n"
- "smax v16.16b, v16.16b, v20.16b\n"
- "str b19, [x14, x12]\n"
+ "smax v18.16b, v18.16b, v23.16b\n"
+ "smax v17.16b, v17.16b, v21.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
"str b18, [x13, x12]\n"
"str b17, [x11, x12]\n"
"str b16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 6168a57ca4..411fd11460 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,77 +41,77 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"smax v23.16b, v4.16b, v3.16b\n"
"smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"smax v23.16b, v4.16b, v3.16b\n"
@@ -134,28 +134,28 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "smax v7.16b, v7.16b, v2.16b\n"
- "smax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "smax v5.16b, v5.16b, v30.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -164,217 +164,217 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"smax v8.16b, v8.16b, v4.16b\n"
@@ -428,10 +428,11 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
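
Note on the register renaming above: the kernel's offset arithmetic moves off x9/x28 and onto the contiguous x20-x27 block, so the extended-asm clobber list at the end of the hunk shrinks to match. A minimal sketch of the invariant being maintained, using a hypothetical helper rather than a library kernel:

    #if defined(__aarch64__)
    #include <cstdint>

    // Every scratch register an extended-asm block writes must appear in its
    // clobber list; packing the kernel onto x20-x27 is what lets x9 and x28
    // drop out of the clobber list in the hunk above.
    static inline uint64_t add_offset(uint64_t base, uint64_t offset)
    {
        uint64_t out;
        __asm__ __volatile__(
            "mov x20, %x[base]\n"
            "add %x[out], x20, %x[offset]\n"
            : [out] "=r" (out)
            : [base] "r" (base), [offset] "r" (offset)
            : "cc", "x20"   // x20 is written above, so it is clobbered here
        );
        return out;
    }
    #endif // defined(__aarch64__)
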
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index e889782fa3..019f402911 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,8 +22,6 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
@@ -31,6 +29,8 @@
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -124,7 +124,7 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -140,42 +140,42 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "saddl v17.8h, v25.8b, v24.8b\n"
+ "saddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddl v17.8h, v25.8b, v24.8b\n"
- "saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -215,23 +215,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -366,49 +366,49 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v18.4s }, [%x[left_shift]]\n"
@@ -438,9 +438,9 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -452,142 +452,142 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v18.4s }, [%x[left_shift]]\n"
@@ -615,9 +615,9 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v17.4s\n"
"smin v13.4s, v13.4s, v17.4s\n"
"smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -672,4 +672,5 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
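
The guard-directive hunks in this file show the pattern applied across the commit: the #include lines move outside the architecture guard, so every target parses the same headers and only the AArch64 kernel body is conditionally compiled. A condensed sketch of the resulting file layout (kernel contents elided):

    #include "pooling.hpp"

    #include <cstdint>
    #include <cstddef>

    #if defined(__aarch64__)

    namespace arm_conv {
    namespace pooling {

    // ... kernel implementation ...

    } // namespace pooling
    } // namespace arm_conv

    #endif // defined(__aarch64__)
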
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 90a31ec677..f7b8dc761c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -21,12 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -42,77 +43,77 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"smax v23.16b, v4.16b, v3.16b\n"
"smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"smax v23.16b, v4.16b, v3.16b\n"
@@ -135,16 +136,16 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "smax v7.16b, v7.16b, v2.16b\n"
- "smax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "smax v5.16b, v5.16b, v30.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sxtl v23.8h, v8.8b\n"
@@ -271,16 +272,16 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"uzp1 v19.16b, v24.16b, v19.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
"uzp1 v16.16b, v20.16b, v19.16b\n"
- "str q18, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q17, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q16, [%x[outptr], x26]\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -289,296 +290,296 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "sxtl2 v22.8h, v8.16b\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v4.4s }, [x20]\n"
- "sxtl v1.4s, v23.4h\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v3.4s }, [x20]\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v4.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v0.4s, v0.4s, v4.4s\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v1.4s, v1.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "sxtl2 v22.8h, v8.16b\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v4.4s }, [x20]\n"
- "sxtl v1.4s, v23.4h\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v3.4s }, [x20]\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v2.4s }, [x20]\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v4.4s\n"
- "srshl v0.4s, v0.4s, v4.4s\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v1.4s, v1.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -627,10 +628,11 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
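
The requantisation epilogue renamed above keeps the same arithmetic throughout: rounding shift left, doubling high multiply, rounding shift right, then a clamp to the signed 8-bit range before the uzp1 narrowing stores. A scalar sketch of the assumed per-lane semantics (saturation on the intermediate steps is omitted for brevity; this is an illustration, not the library's reference path):

    #include <cstdint>

    static inline int8_t requantize_per_layer(int32_t acc, int32_t left_shift,
                                              int32_t mul, int32_t right_shift)
    {
        int64_t v = static_cast<int64_t>(acc) * (int64_t{1} << left_shift);    // srshl, positive shift
        v = (v * mul + (int64_t{1} << 30)) >> 31;                              // sqrdmulh
        if (right_shift < 0)
            v = (v + (int64_t{1} << (-right_shift - 1))) >> -right_shift;      // srshl, rounding right shift
        if (v < -128) v = -128;                                                // smax with not(0x7f)
        if (v >  127) v =  127;                                                // smin with 0x7f
        return static_cast<int8_t>(v);
    }
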
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 76828a911e..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -105,7 +105,7 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -121,42 +121,42 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "uaddl v17.8h, v25.8b, v24.8b\n"
+ "uaddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddl v17.8h, v25.8b, v24.8b\n"
- "uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -196,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -330,49 +330,49 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -397,9 +397,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -411,142 +411,142 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -569,9 +569,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -626,4 +626,5 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
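
The "Oddments" ladders in the hunks above are the kernels' tail handling: each `tbz` tests one bit of `n_channels` and conditionally loads 8, 4, 2, or 1 bytes into successive vector lanes, after which `uaddl`/`uaddl2` and `uaddw`/`uaddw2` widen the u8 sums into 32-bit accumulators. A minimal scalar C++ sketch of the same bit-test idea (illustrative only; the function and parameter names are ours, not the kernel's):

```cpp
#include <cstddef>
#include <cstdint>

// Scalar model of the "Oddments" bit-test ladder: n_channels < 16 here,
// and each set bit of n_channels tells us how many more bytes of the
// tail to consume, mirroring the tbz chains in the assembly.
static void accumulate_tail(const uint8_t *in, size_t n_channels,
                            uint32_t *acc /* n_channels entries */)
{
    size_t pos = 0;
    // Bit 3: 8 bytes, bit 2: 4 bytes, bit 1: 2 bytes, bit 0: 1 byte.
    for (int bit = 3; bit >= 0; --bit)
    {
        if (n_channels & (size_t(1) << bit))
        {
            const size_t chunk = size_t(1) << bit;
            for (size_t i = 0; i < chunk; ++i, ++pos)
            {
                acc[pos] += in[pos];  // u8 widened into u32, as uaddl/uaddw do
            }
        }
    }
}
```

Splitting the tail by bits bounds the work at four conditional loads per input, however many of the remaining 1-15 bytes are present.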
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 556d833681..9d160bf8f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<u
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
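
This header hunk shows the guard-directive change the commit message refers to: the `__aarch64__` guard now encloses only the strategy definition, while `#pragma once` and any includes stay outside it, so the file remains a valid (if empty) translation unit on other architectures. Schematically, the touched sources now follow this shape (a sketch of the pattern, not the literal file contents):

```cpp
// Includes first, unconditionally, so the TU is always well-formed...
#include <cstddef>
#include <cstdint>

// ...then the architecture guard around the actual kernel code.
#if defined(__aarch64__)

namespace arm_conv {
namespace pooling {
// ... kernel strategy / implementation lives here ...
} // namespace pooling
} // namespace arm_conv

#endif // defined(__aarch64__)
```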
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 149566197a..66cdb7f849 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -111,7 +112,7 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"umax v18.16b, v18.16b, v21.16b\n"
"umax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
- "umax v16.16b, v16.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v16.16b\n"
"str q19, [x14, x12]\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
@@ -121,43 +122,43 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v19.16b, v27.16b, v26.16b\n"
+ "umax v16.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
"umax v17.16b, v27.16b, v23.16b\n"
- "umax v16.16b, v24.16b, v22.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
+ "umax v19.16b, v24.16b, v22.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
"umax v18.16b, v18.16b, v21.16b\n"
- "str q19, [x14, x12]\n"
+ "str q16, [x14, x12]\n"
"umax v17.16b, v17.16b, v20.16b\n"
- "umax v16.16b, v16.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v19.16b\n"
"str q18, [x13, x12]\n"
"str q17, [x11, x12]\n"
"str q16, [x10, x12]\n"
"add x12, x12, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x28, x15]\n"
- "ldr b29, [x25, x15]\n"
- "umax v21.16b, v30.16b, v29.16b\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "umax v23.16b, v16.16b, v17.16b\n"
"subs x16, x16, #0x1\n"
- "ldr b28, [x22, x15]\n"
- "ldr b27, [x26, x15]\n"
- "umax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x9, x15]\n"
- "ldr b25, [x27, x15]\n"
- "umax v19.16b, v27.16b, v26.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "ldr b24, [x24, x15]\n"
- "ldr b23, [x23, x15]\n"
- "umax v18.16b, v25.16b, v24.16b\n"
- "umax v17.16b, v27.16b, v23.16b\n"
- "ldr b22, [x21, x15]\n"
- "umax v16.16b, v24.16b, v22.16b\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "umax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "umax v18.16b, v17.16b, v19.16b\n"
+ "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "umax v16.16b, v19.16b, v16.16b\n"
"add x15, x15, #0x1\n"
- "umax v18.16b, v18.16b, v21.16b\n"
- "umax v17.16b, v17.16b, v20.16b\n"
- "umax v16.16b, v16.16b, v20.16b\n"
- "str b19, [x14, x12]\n"
+ "umax v18.16b, v18.16b, v23.16b\n"
+ "umax v17.16b, v17.16b, v21.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
"str b18, [x13, x12]\n"
"str b17, [x11, x12]\n"
"str b16, [x10, x12]\n"
@@ -172,4 +173,5 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
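
The register shuffle in the tail and oddments blocks above leaves the algorithm untouched: for a 2x2 stride-1 window, each 3x3 input patch yields a 2x2 output block, and the partial maxima of the overlapping middle row are computed once and reused across outputs (the role registers like v20/v21 play in the `umax` chains). A scalar C++ sketch of that reuse (our own reduction, not the kernel's layout):

```cpp
#include <algorithm>
#include <cstdint>

// 2x2 stride-1 max pooling over one 3x3 patch. The horizontal 2-wide
// maxima of each input row are computed once; the middle row's maxima
// feed both output rows, so nothing is recomputed.
static void max_2x2_s1_3x3_patch(const uint8_t in[3][3], uint8_t out[2][2])
{
    uint8_t h[3][2];
    for (int r = 0; r < 3; ++r)
    {
        h[r][0] = std::max(in[r][0], in[r][1]);
        h[r][1] = std::max(in[r][1], in[r][2]);
    }
    for (int r = 0; r < 2; ++r)
    {
        out[r][0] = std::max(h[r][0], h[r + 1][0]);
        out[r][1] = std::max(h[r][1], h[r + 1][1]);
    }
}
```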
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index 98f5b8351c..2ceef125ca 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,11 +22,11 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -41,77 +41,77 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"umax v23.16b, v4.16b, v3.16b\n"
"umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"umax v7.16b, v7.16b, v18.16b\n"
"umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"umax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"umax v23.16b, v4.16b, v3.16b\n"
@@ -134,28 +134,28 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "umax v7.16b, v7.16b, v2.16b\n"
- "umax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "umax v5.16b, v5.16b, v30.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x9]\n"
- "str q7, [%x[outptr], x28]\n"
- "add x9, x9, #0x40\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
"add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -164,217 +164,217 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"umax v8.16b, v8.16b, v4.16b\n"
@@ -428,10 +428,11 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
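
Beyond the value-register renumbering, the hunks above compact the address registers into x20-x27, which is why x9 and x28 drop out of the clobber list at the end of the asm block: every general-purpose register named there is reserved across the statement, so a smaller clobber set leaves more registers for the surrounding code. A reduced, hypothetical snippet illustrating the mechanism (not kernel code):

```cpp
#if defined(__aarch64__)

#include <cstdint>

// Every register in the clobber list is unavailable to the compiler
// across the asm statement; keeping the scratch set small and
// contiguous hands the rest back to the register allocator.
static inline uint64_t add_offset(uint64_t base, uint64_t offset)
{
    uint64_t result;
    __asm__ __volatile__(
        "mov x20, %x[base]\n"
        "add %x[result], x20, %x[offset]\n"
        : [result] "=r" (result)
        : [base] "r" (base), [offset] "r" (offset)
        : "x20"  // the only scratch register this block touches
    );
    return result;
}

#endif // defined(__aarch64__)
```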
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 19227d8aaa..31a3489e5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,8 +22,6 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
@@ -31,6 +29,8 @@
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -132,7 +132,7 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -145,42 +145,42 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v1.16b, v15.16b\n"
"mov v0.16b, v15.16b\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "uaddl v17.8h, v25.8b, v24.8b\n"
+ "uaddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddl v17.8h, v25.8b, v24.8b\n"
- "uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -220,23 +220,23 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -391,56 +391,56 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
"ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v19.4s\n"
- "srshl v12.4s, v12.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
"sqrdmulh v15.4s, v15.4s, v18.4s\n"
@@ -467,9 +467,9 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -481,151 +481,151 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
"ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v19.4s\n"
- "srshl v12.4s, v12.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
"sqrdmulh v15.4s, v15.4s, v18.4s\n"
@@ -650,9 +650,9 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -707,4 +707,5 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)
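
The requantisation tail in this file — `srshl` by `left_shift`, `sqrdmulh` by `combined_rescale_value`, `srshl` by the (negative) `right_shift`, add the output offset, clamp to [0, 255], then a cascade of `uzp1` to narrow the 32-bit lanes back to bytes — is a standard fixed-point rescale. A scalar C++ model of one lane, with hardware saturation elided for brevity (parameter names are ours, not the kernel's):

```cpp
#include <algorithm>
#include <cstdint>

static uint8_t requantize(int32_t acc, int32_t left_shift,
                          int32_t rescale_q31, int32_t right_shift,
                          int32_t output_offset)
{
    // srshl by left_shift: rounding is a no-op for a left shift.
    int32_t shifted = acc << left_shift;
    // sqrdmulh: high half of the rounding-doubling product,
    // i.e. (a*b + 2^30) >> 31; saturation omitted in this sketch.
    int32_t scaled = int32_t((int64_t(shifted) * rescale_q31
                              + (int64_t(1) << 30)) >> 31);
    // srshl by a negative amount is a rounding arithmetic shift right;
    // right_shift here is that amount made positive.
    if (right_shift > 0)
        scaled = int32_t((int64_t(scaled)
                          + (int64_t(1) << (right_shift - 1))) >> right_shift);
    int32_t v = scaled + output_offset;
    return uint8_t(std::clamp(v, 0, 255));
}
```

In the vector code the same steps run over four accumulator registers at a time, and repeated `uzp1` passes keep the low byte of each 32-bit lane, packing four vectors of words into one 16-byte result.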
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 7eea14f70f..f4927c5536 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,12 +22,12 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include "pooling.hpp"
#include <cstdint>
#include <cstddef>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -43,77 +43,77 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x9, #0x0\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "mov x27, #0x20\n" // cntb _, ALL, #2
- "mov x26, #0x30\n" // cntb _, ALL, #3
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldr q2, [x24, x28]\n"
- "ldr q1, [x23, x28]\n"
- "ldr q0, [x24, x27]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x24, x26]\n"
- "ldr q29, [x23, x26]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"umax v23.16b, v4.16b, v3.16b\n"
"umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
"umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x24, x28]\n"
+ "ldr q2, [x21, x26]\n"
"umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x23, x28]\n"
+ "ldr q1, [x20, x26]\n"
"umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x24, x27]\n"
+ "ldr q0, [x21, x24]\n"
"umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x23, x27]\n"
+ "ldr q31, [x20, x24]\n"
"umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x24, x26]\n"
+ "ldr q30, [x21, x23]\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x23, x26]\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x22, x28]\n"
- "ldr q21, [x21, x28]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
"subs x25, x25, #0x1\n"
"umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x22, x27]\n"
- "ldr q20, [x21, x27]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"umax v7.16b, v7.16b, v18.16b\n"
"umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x22, x26]\n"
- "ldr q24, [x21, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"umax v5.16b, v5.16b, v16.16b\n"
- "add x20, x20, #0x20\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"umax v23.16b, v4.16b, v3.16b\n"
@@ -136,16 +136,16 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
- "ldr q2, [x24, x28]\n"
- "ldr q0, [x24, x27]\n"
- "umax v7.16b, v7.16b, v2.16b\n"
- "umax v6.16b, v6.16b, v0.16b\n"
- "ldr q30, [x24, x26]\n"
- "umax v5.16b, v5.16b, v30.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
@@ -292,17 +292,17 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"uzp1 v19.16b, v25.16b, v19.16b\n"
"uzp1 v18.16b, v24.16b, v18.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x40\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"uzp1 v16.16b, v22.16b, v21.16b\n"
"uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
- "str q17, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"str q16, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -311,314 +311,314 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x23, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldr q4, [x24, x9]\n"
- "ldr q3, [x23, x9]\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x22, x9]\n"
- "ldr q22, [x21, x9]\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "add x20, x20, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ldr q4, [x24, x9]\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v4.4s }, [x20]\n"
- "uxtl v23.8h, v8.8b\n"
- "uxtl2 v24.8h, v8.16b\n"
- "neg v4.4s, v4.4s\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
- "saddw v0.4s, v4.4s, v23.4h\n"
- "saddw2 v23.4s, v4.4s, v23.8h\n"
- "saddw v31.4s, v4.4s, v24.4h\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v2.4s }, [x20]\n"
- "saddw2 v30.4s, v4.4s, v24.8h\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v2.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v30.4s, v30.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v1.4s\n"
- "srshl v23.4s, v23.4s, v1.4s\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "srshl v30.4s, v30.4s, v1.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "add v23.4s, v23.4s, v16.4s\n"
- "add v31.4s, v31.4s, v16.4s\n"
- "add v30.4s, v30.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0x0\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0xff\n"
- "smin v0.4s, v0.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v16.4s\n"
- "smin v31.4s, v31.4s, v16.4s\n"
- "smin v30.4s, v30.4s, v16.4s\n"
- "uzp1 v23.16b, v0.16b, v23.16b\n"
- "uzp1 v16.16b, v31.16b, v30.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x9]\n"
- "add x9, x9, #0x10\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x9\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "add x24, x24, x9\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x21, x21, x9\n"
+ "add x20, x20, x27\n"
"movi v28.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x24], #0x8\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d28, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v28.h }[6], [x22], #0x2\n"
- "ld1 { v22.h }[6], [x21], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v28.b }[14], [x22], #0x1\n"
- "ld1 { v22.b }[14], [x21], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v28.b }[12], [x22], #0x1\n"
- "ld1 { v22.b }[12], [x21], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v28.h }[4], [x22], #0x2\n"
- "ld1 { v22.h }[4], [x21], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v28.b }[10], [x22], #0x1\n"
- "ld1 { v22.b }[10], [x21], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v28.b }[8], [x22], #0x1\n"
- "ld1 { v22.b }[8], [x21], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s28, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v28.b }[6], [x22], #0x1\n"
- "ld1 { v22.b }[6], [x21], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v28.b }[4], [x22], #0x1\n"
- "ld1 { v22.b }[4], [x21], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h28, [x22], #0x2\n"
- "ldr h22, [x21], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v28.b }[2], [x22], #0x1\n"
- "ld1 { v22.b }[2], [x21], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x24], #0x1\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b28, [x22], #0x1\n"
- "ldr b22, [x21], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
"subs x25, x25, #0x1\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x24, [x20], #0x8\n"
- "add x24, x24, x9\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
"movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x24], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x24], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x24], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x24], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x24], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x24], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x24], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x24], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x24], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x24], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x24], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x24], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
"umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v4.4s }, [x20]\n"
- "uxtl v23.8h, v8.8b\n"
- "uxtl2 v24.8h, v8.16b\n"
- "neg v4.4s, v4.4s\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
- "saddw v0.4s, v4.4s, v23.4h\n"
- "saddw2 v23.4s, v4.4s, v23.8h\n"
- "saddw v31.4s, v4.4s, v24.4h\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v2.4s }, [x20]\n"
- "saddw2 v30.4s, v4.4s, v24.8h\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1r { v16.4s }, [x20]\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v2.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v30.4s, v30.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v1.4s\n"
- "srshl v23.4s, v23.4s, v1.4s\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "srshl v30.4s, v30.4s, v1.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "add v23.4s, v23.4s, v16.4s\n"
- "add v31.4s, v31.4s, v16.4s\n"
- "add v30.4s, v30.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0x0\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
"movi v16.4s, #0xff\n"
- "smin v0.4s, v0.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v16.4s\n"
- "smin v31.4s, v31.4s, v16.4s\n"
- "smin v30.4s, v30.4s, v16.4s\n"
- "uzp1 v23.16b, v0.16b, v23.16b\n"
- "uzp1 v16.16b, v31.16b, v30.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -667,10 +667,11 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__aarch64__)
+
+#endif // defined(__aarch64__)
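The register renaming in the hunks above is mechanical — the v0–v4 temporaries become v16–v23 and the pointer scratch shifts down one register (x24→x23, x9→x27) — which is what lets x9 and x28 drop out of the clobber list in the hunk just above; the arithmetic is unchanged. That epilogue is the usual per-layer requantisation of the pooled maxima: subtract the input offset, rounding left shift, fixed-point multiply (sqrdmulh), rounding right shift, add the output offset, clamp to [0, 255], and narrow with uzp1. A scalar sketch of the same sequence follows; parameter names mirror Requantize32, but the function itself is illustrative rather than the library's code path, and it assumes per_layer_right_shift is stored non-positive (srshl right-shifts for negative operands).

#include <algorithm>
#include <cstdint>

// Scalar model of the vector epilogue above: requantise one pooled uint8
// maximum with per-layer parameters (names mirror Requantize32).
uint8_t requantize_u8(uint8_t max_val,
                      int32_t input_offset,
                      int32_t per_layer_left_shift,
                      int32_t per_layer_mul,
                      int32_t per_layer_right_shift,
                      int32_t output_offset)
{
    int32_t v = int32_t(max_val) - input_offset;        // neg + saddw
    v <<= per_layer_left_shift;                         // srshl (left shift)
    // sqrdmulh: rounding doubling multiply returning the high half.
    v = int32_t((int64_t(v) * per_layer_mul + (int64_t(1) << 30)) >> 31);
    // srshl with a non-positive shift operand acts as a rounding right shift.
    int s = -per_layer_right_shift;                     // assumed stored <= 0
    if (s > 0) v = (v + (1 << (s - 1))) >> s;
    v += output_offset;                                 // add offset
    v = std::min(255, std::max(0, v));                  // smax/smin clamp
    return uint8_t(v);                                  // uzp1 narrowing
}
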
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
index 2bb22131f7..1f8f863de2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,10 @@
#include <cstdint>
#include <cstring>
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#include "bfloat.hpp"
+using arm_gemm::bfloat16;
+#endif
namespace arm_conv {
namespace pooling {
@@ -41,9 +45,15 @@ void cpp_nhwc_1x1_stride_any_depthfirst_impl(
}
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const float *const *, float *);
-#if defined(__ARM_FP16_ARGS)
+
+#ifdef __ARM_FP16_ARGS
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const __fp16 *const *, __fp16 *);
-#endif // defined(__ARM_FP16_ARGS)
+#endif
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const bfloat16 *const *, bfloat16 *);
+#endif
+
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const int8_t *const *, int8_t *);
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const uint8_t *const *, uint8_t *);
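The cpp_nhwc_1x1 hunk above adds a bfloat16 instantiation of the generic copy kernel behind ARM_COMPUTE_ENABLE_BF16. A condensed sketch of the same guarded-instantiation pattern follows; copy_1x1 and its simplified signature are stand-ins, not the library's function, though "bfloat.hpp" and arm_gemm::bfloat16 are the names the diff itself uses.

#include <cstdint>
#include <cstring>

#ifdef ARM_COMPUTE_ENABLE_BF16
#include "bfloat.hpp"              // provides arm_gemm::bfloat16
using arm_gemm::bfloat16;
#endif

// Hypothetical 1x1 "pooling" kernel: with a 1x1 window and any stride the
// output row is a plain copy of the selected input row.
template <typename T>
void copy_1x1(uint64_t n_channels, const T *const *inptrs, T *outptr)
{
    std::memcpy(outptr, inptrs[0], n_channels * sizeof(T));
}

template void copy_1x1(uint64_t, const float *const *, float *);

#ifdef ARM_COMPUTE_ENABLE_BF16
// The template is stamped out for bfloat16 only when the build enables BF16.
template void copy_1x1(uint64_t, const bfloat16 *const *, bfloat16 *);
#endif
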
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 250d92c051..f6682e75e2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index bce623acd1..67b07205cd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -91,34 +91,34 @@ void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x20, %x[args], %[offsetof_rescale]\n"
"ld1rqh { z4.h }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.h, x3, x5\n"
+ "whilelt p0.h, x3, x5\n"
"mov x6, #0x0\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
- "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
"whilelt p1.h, x3, x5\n"
"b.none 2f\n"
@@ -206,4 +206,4 @@ void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
index 117eb36007..cf09f421c4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index c43da42d9e..60f17b7bc2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -57,68 +58,68 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.h, z1.h, z0.h\n"
"fadd z19.h, z31.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fadd z22.h, z29.h, z22.h\n"
"fadd z18.h, z28.h, z18.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z5.h, z5.h, z19.h\n"
"fadd z4.h, z4.h, z18.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
"fadd z3.h, z3.h, z17.h\n"
"fadd z2.h, z2.h, z16.h\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.h, z1.h, z0.h\n"
@@ -141,16 +142,16 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z1.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x28, LSL #1]\n"
- "fadd z4.h, z4.h, z29.h\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "fadd z3.h, z3.h, z27.h\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "fadd z2.h, z2.h, z25.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z2.h, z2.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z5.h, z5.h, z6.h\n"
@@ -173,44 +174,44 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z19.h, z31.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z19.h, z23.h, z19.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z5.h, z5.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z19.h, z31.h, z30.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z5.h, z5.h, z19.h\n"
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z1.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z1.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z5.h, z5.h, z6.h\n"
@@ -229,4 +230,4 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
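The SME hunks above repeat two changes across header/source pairs: the source guard is normalised to match the new header guard (ARM_COMPUTE_ENABLE_SME && __ARM_FP16_ARGS, replacing the stale __ARM_FEATURE_SVE-based test), and the standard includes are hoisted above the guard so the translation unit stays well-formed when the kernel is compiled out. A minimal layout sketch of the resulting source file, with placeholder names (__fp16 assumes an AArch64 toolchain):

// generic.cpp layout these hunks converge on: includes above the guard,
// kernel body compiled out when SME or the FP16 ABI is unavailable.
#include <cstddef>
#include <cstdint>

#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)

namespace arm_conv {
namespace pooling {

void sme_fp16_pool_impl(uint64_t n_channels, const __fp16 *const *inptrs, __fp16 *outptr)
{
  // ... SME assembly body elided in this sketch ...
  (void) n_channels; (void) inptrs; (void) outptr;
}

}  // namespace pooling
}  // namespace arm_conv

#endif  // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
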
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9489c1f8da..cd6c7449a8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f71f2625b6..7fc776ed4e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.h, x15, x13\n"
+ "whilelt p0.h, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x22, x15, LSL #1]\n"
- "ld1h { z19.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
"ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
"whilelt p0.h, x14, x13\n"
- "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
- "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
"ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
- "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z23.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
"ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
"ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z20.h\n"
- "fmax z18.h, p2/M, z18.h, z22.h\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
"ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
- "fmax z17.h, p2/M, z17.h, z21.h\n"
- "fmax z16.h, p2/M, z16.h, z21.h\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
"ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
"st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
"ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
"st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
"st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
- "ld1h { z19.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
@@ -125,13 +125,13 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.h, x14, x13\n"
"movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
"movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
- "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z23.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z19.h\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z20.h\n"
+ "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
+ "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
"fmax z18.h, p2/M, z18.h, z22.h\n"
- "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
"fmax z17.h, p2/M, z17.h, z21.h\n"
- "fmax z16.h, p2/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
"st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
"st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
@@ -145,4 +145,4 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
index 33ff1f2154..bfb3bf5b1a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
index c07ce97231..afa2ccbd71 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.h, #0xfc00\n"
"mov z3.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.h, #0xfc00\n"
"mov z1.h, #0xfc00\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z29.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
"fmax z23.h, p0/M, z23.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fmax z18.h, p0/M, z18.h, z29.h\n"
"fmax z22.h, p0/M, z22.h, z28.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fmax z17.h, p0/M, z17.h, z27.h\n"
"fmax z21.h, p0/M, z21.h, z26.h\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
"fmax z16.h, p0/M, z16.h, z25.h\n"
"fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
"fmax z19.h, p0/M, z19.h, z23.h\n"
"fmax z18.h, p0/M, z18.h, z22.h\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"fmax z4.h, p0/M, z4.h, z19.h\n"
"fmax z3.h, p0/M, z3.h, z18.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
"fmax z2.h, p0/M, z2.h, z17.h\n"
"fmax z1.h, p0/M, z1.h, z16.h\n"
- "ld1h { z29.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
@@ -138,15 +139,15 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z0.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x28, LSL #1]\n"
- "fmax z3.h, p0/M, z3.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmax z2.h, p0/M, z2.h, z17.h\n"
- "ld1h { z16.h }, p1/Z, [x24, x26, LSL #1]\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z3.h, p0/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
"fmax z1.h, p0/M, z1.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z31.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z23.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z0.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z0.h\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
@@ -221,4 +222,4 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
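The fmax network above reduces four input rows pairwise before folding into the running maximum, with the accumulators seeded to 0xfc00 (FP16 negative infinity); leftover cells are handled one at a time in the "Single input loop". A scalar model of the same reduction, in fp32 for readability (the function name is hypothetical):

#include <cmath>
#include <cstdint>

// Scalar model of the "4 inputs loop" / "Single input loop" structure above:
// reduce four rows with a pairwise fmax tree, fold into the running maximum,
// then mop up the remaining valid cells one at a time.
float max_over_cells(const float *const *inptrs, uint64_t n_valid_cells,
                     uint64_t channel)
{
    float acc = -INFINITY;                      // mov z4.h, #0xfc00 (-inf)
    uint64_t i = 0;
    for (; i + 4 <= n_valid_cells; i += 4)      // lsr x25, n_valid_cells, #0x2
    {
        float a = std::fmax(inptrs[i + 0][channel], inptrs[i + 1][channel]);
        float b = std::fmax(inptrs[i + 2][channel], inptrs[i + 3][channel]);
        acc = std::fmax(acc, std::fmax(a, b));  // fmax tree, then accumulate
    }
    for (; i < n_valid_cells; ++i)              // ands x21, ..., #0x3 tail
        acc = std::fmax(acc, inptrs[i][channel]);
    return acc;
}
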
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index fa1b441371..23a0eee04e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index cf69800522..8c8532827a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -91,34 +91,34 @@ void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"add x20, %x[args], %[offsetof_rescale]\n"
"ld1rqw { z4.s }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.s, x3, x5\n"
+ "whilelt p0.s, x3, x5\n"
"mov x6, #0x0\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
- "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
"whilelt p1.s, x3, x5\n"
"b.none 2f\n"
@@ -206,4 +206,4 @@ void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
index 814c89ca23..29bcfc5a3b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 03ab9c0a9e..86e7f84542 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -57,68 +58,68 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.s, z1.s, z0.s\n"
"fadd z19.s, z31.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fadd z22.s, z29.s, z22.s\n"
"fadd z18.s, z28.s, z18.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z5.s, z5.s, z19.s\n"
"fadd z4.s, z4.s, z18.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
"fadd z3.s, z3.s, z17.s\n"
"fadd z2.s, z2.s, z16.s\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.s, z1.s, z0.s\n"
@@ -141,16 +142,16 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z1.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x28, LSL #2]\n"
- "fadd z4.s, z4.s, z29.s\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "fadd z3.s, z3.s, z27.s\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "fadd z2.s, z2.s, z25.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z2.s, z2.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z5.s, z5.s, z6.s\n"
@@ -173,44 +174,44 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z19.s, z31.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z19.s, z23.s, z19.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z5.s, z5.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z19.s, z31.s, z30.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z5.s, z5.s, z19.s\n"
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z1.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z1.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z5.s, z5.s, z6.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 4e3cd6e228..338348231f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 05edac6623..3c7213a498 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.s, x15, x13\n"
+ "whilelt p0.s, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x22, x15, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
"ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
"whilelt p0.s, x14, x13\n"
- "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
- "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
"ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
- "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z23.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
"ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
"ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z20.s\n"
- "fmax z18.s, p2/M, z18.s, z22.s\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
"ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
- "fmax z17.s, p2/M, z17.s, z21.s\n"
- "fmax z16.s, p2/M, z16.s, z21.s\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
"ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
"st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
"ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
"st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
"st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
@@ -125,13 +125,13 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.s, x14, x13\n"
"movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
"movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
- "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z23.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z19.s\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z20.s\n"
+ "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
+ "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
"fmax z18.s, p2/M, z18.s, z22.s\n"
- "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
"fmax z17.s, p2/M, z17.s, z21.s\n"
- "fmax z16.s, p2/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
"st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
"st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
@@ -145,4 +145,4 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
index 0c0e445c7a..9bc1f11601 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 14c07724a1..0dabc2f292 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.s, #0xff800000\n"
"mov z3.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.s, #0xff800000\n"
"mov z1.s, #0xff800000\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z29.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
"fmax z23.s, p0/M, z23.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"fmax z18.s, p0/M, z18.s, z29.s\n"
"fmax z22.s, p0/M, z22.s, z28.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"fmax z17.s, p0/M, z17.s, z27.s\n"
"fmax z21.s, p0/M, z21.s, z26.s\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
"fmax z16.s, p0/M, z16.s, z25.s\n"
"fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
"fmax z19.s, p0/M, z19.s, z23.s\n"
"fmax z18.s, p0/M, z18.s, z22.s\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"fmax z4.s, p0/M, z4.s, z19.s\n"
"fmax z3.s, p0/M, z3.s, z18.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
"fmax z2.s, p0/M, z2.s, z17.s\n"
"fmax z1.s, p0/M, z1.s, z16.s\n"
- "ld1w { z29.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
@@ -138,15 +139,15 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z0.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x28, LSL #2]\n"
- "fmax z3.s, p0/M, z3.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmax z2.s, p0/M, z2.s, z17.s\n"
- "ld1w { z16.s }, p1/Z, [x24, x26, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
"fmax z1.s, p0/M, z1.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z31.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z23.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z0.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z0.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
index e383a4c3bd..318510e697 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
index ded1274c13..c24e977dc6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -109,7 +110,7 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,48 +126,48 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
@@ -203,20 +204,20 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -332,74 +333,74 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- "mov z19.s, #0x7f\n"
+ "mov z18.s, #0x7f\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z18.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 1613970618..c9a80e6a5b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index e3b9c98d80..96617566a8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.b, x15, x13\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z30.b }, p1/Z, [x27, x15]\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z20.b\n"
- "smax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
"ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "smax z17.b, p2/M, z17.b, z21.b\n"
- "smax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
"ld1b { z25.b }, p1/Z, [x26, x15]\n"
"st1b { z19.b }, p0, [x12, x14]\n"
"ld1b { z24.b }, p1/Z, [x23, x15]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -125,13 +125,13 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.b, x14, x13\n"
"movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
"movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z19.b\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z20.b\n"
+ "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
"smax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
"smax z17.b, p2/M, z17.b, z21.b\n"
- "smax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -145,4 +145,4 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
index 56aa120cfe..3e0d76c277 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, i
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
index 4e6cad6e92..d2b45cd353 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
"mov z3.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x80\n"
"mov z1.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
"smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"smax z4.b, p0/M, z4.b, z19.b\n"
"smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"smax z2.b, p0/M, z2.b, z17.b\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
@@ -138,15 +139,15 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z4.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
index ee02c60bc1..c6263f5dbc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index cc58d3e9e2..91f2f7ab31 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -128,7 +129,7 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -144,48 +145,48 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
@@ -222,20 +223,20 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -368,79 +369,79 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
"ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
"ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
- "mov z19.s, #0x7f\n"
+ "mov z18.s, #0x7f\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z18.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
@@ -456,4 +457,4 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
index 050aff397e..9667d37954 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 3850ebf464..e9b586f4ce 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -56,68 +57,68 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
"mov z3.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x80\n"
"mov z1.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
"smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"smax z4.b, p0/M, z4.b, z19.b\n"
"smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"smax z2.b, p0/M, z2.b, z17.b\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
@@ -140,15 +141,15 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"smax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -292,83 +293,83 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z0.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
- ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
+ ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
- ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
- ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- "mov z19.s, #0x7f\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
- "not z16.s, p0/M, z19.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
+ ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
+ ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
+ ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
index 2cdb2883c2..29a03ec509 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
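The change to this header and its companion .cpp below is the guard-directive convention applied throughout this patch: the .hpp declares its strategy only under ARM_COMPUTE_ENABLE_SME, and the .cpp includes the standard headers unconditionally before opening the guard, so the translation unit still compiles (to nothing) when SME support is disabled. In outline:

    #include <cstdint>
    #include <cstddef>
    #include <cstring>
    #include <cmath>

    #if defined(ARM_COMPUTE_ENABLE_SME)

    namespace arm_conv {
    namespace pooling {

    // ... SME kernel implementation ...

    } // namespace pooling
    } // namespace arm_conv

    #endif // defined(ARM_COMPUTE_ENABLE_SME)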
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
index a637654908..f0e7bbf5cc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -109,7 +110,7 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,48 +126,48 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
@@ -203,20 +204,20 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -332,74 +333,74 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 6d5f53d7a5..3df4e4efb8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -45,3 +47,5 @@ struct sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<u
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 9f267d76ea..9088cbde89 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -70,23 +70,23 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p1.b, x15, x13\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"b.none 2f\n"
@@ -95,25 +95,25 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z30.b }, p1/Z, [x27, x15]\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z20.b\n"
- "umax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
"ld1b { z26.b }, p1/Z, [x28, x15]\n"
- "umax z17.b, p2/M, z17.b, z21.b\n"
- "umax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
"ld1b { z25.b }, p1/Z, [x26, x15]\n"
"st1b { z19.b }, p0, [x12, x14]\n"
"ld1b { z24.b }, p1/Z, [x23, x15]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z23.b }, p1/Z, [x22, x15]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
- "ld1b { z19.b }, p1/Z, [x20, x15]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -125,13 +125,13 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p0.b, x14, x13\n"
"movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
"movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z23.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z19.b\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z20.b\n"
+ "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
"umax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
"umax z17.b, p2/M, z17.b, z21.b\n"
- "umax z16.b, p2/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
@@ -145,4 +145,4 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
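Functionally, the kernel above computes a 2x2 output tile of 2x2 stride-1 max pooling per vector lane from nine input rows (a 3x3 patch); the register renaming replaces destructive umax chains with movprfx-prefixed constructive forms so intermediate maxima for the shared centre elements can be reused across the four outputs. The per-lane computation, as a scalar sketch:

    #include <algorithm>
    #include <cstdint>

    // Scalar model: one 2x2 output tile of 2x2 stride-1 max pooling.
    static void max_pool_2x2_s1(const uint8_t in[3][3], uint8_t out[2][2])
    {
        for (int oy = 0; oy < 2; ++oy)
            for (int ox = 0; ox < 2; ++ox)
                out[oy][ox] = std::max(std::max(in[oy][ox], in[oy][ox + 1]),
                                       std::max(in[oy + 1][ox], in[oy + 1][ox + 1]));
    }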
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
index 5c637ec3c3..077c8ed2f7 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
index 9a13deafda..06f13e8111 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,9 +22,10 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -54,68 +55,68 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
"mov z1.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
"umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"umax z4.b, p0/M, z4.b, z19.b\n"
"umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"umax z2.b, p0/M, z2.b, z17.b\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
@@ -138,15 +139,15 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -166,44 +167,44 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z4.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z0.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z4.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
index 2930993800..bd30a32828 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index a2fe7a301d..52c52ccdb9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -136,7 +137,7 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -149,48 +150,48 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
@@ -227,20 +228,20 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -393,61 +394,61 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
- ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
- ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
- "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
@@ -457,19 +458,19 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
"incb x27\n"
"whilelt p4.b, x27, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
index d7bf6cbd08..69d627c047 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
namespace arm_conv {
namespace pooling {
@@ -40,3 +42,5 @@ struct sme_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t,
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
index d050cd014f..c8e8e7d399 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
-#if defined(ARM_COMPUTE_ENABLE_SME)
-
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace arm_conv {
namespace pooling {
@@ -56,68 +57,68 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
"mov z3.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
"mov z1.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
"umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
"umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"umax z5.b, p0/M, z5.b, z19.b\n"
"umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
"umax z2.b, p0/M, z2.b, z17.b\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
- "ld1b { z29.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z28.b }, p3/Z, [x21, x28]\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "ld1b { z27.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z26.b }, p2/Z, [x21, x27]\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
@@ -140,15 +141,15 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z5.b, p0/M, z5.b, z0.b\n"
- "ld1b { z18.b }, p3/Z, [x24, x28]\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z17.b }, p2/Z, [x24, x27]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
"umax z1.b, p0/M, z1.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
@@ -313,92 +314,92 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "ldp x20, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z5.b, p0/M, z5.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
- "ld1b { z31.b }, p4/Z, [x23, x9]\n"
- "ld1b { z23.b }, p4/Z, [x22, x9]\n"
- "ld1b { z30.b }, p4/Z, [x21, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z5.b, p0/M, z5.b, z19.b\n"
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z0.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z5.b, p0/M, z5.b, z0.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
- ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
- ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
- ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
+ "ld1rw { z22.s }, p0/Z, [x20]\n"
+ ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
+ ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
"ld1rw { z19.s }, p0/Z, [x20]\n"
- ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
+ ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
- ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
- ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
- ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
- ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
- ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
- ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
- ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
- "add z1.s, z1.s, z16.s\n"
+ ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
+ ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
+ ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
"add z23.s, z23.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z23.s, p0/M, z23.s, z17.s\n"
+ "smax z21.s, p0/M, z21.s, z17.s\n"
+ "smax z20.s, p0/M, z20.s, z17.s\n"
+ "smax z18.s, p0/M, z18.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z16.s\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z23.h, z21.h\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z20.h, z18.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 593fb58445..1ba78f3fba 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -88,8 +88,8 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"ldp x5, x6, [x21, #0x0]\n"
- "whilelt p0.h, XZR, x20\n"
- "whilelt p1.h, x3, x2\n"
+ "whilelt p2.h, XZR, x20\n"
+ "whilelt p0.h, x3, x2\n"
"ldp x7, x8, [x21, #0x10]\n"
"ldp x17, x16, [x4, #0x0]\n"
"add x15, %x[args], %[offsetof_rescale]\n"
@@ -101,25 +101,25 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
- "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
- "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
- "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
"whilelt p1.h, x3, x2\n"
- "ld1rqh { z0.h }, p0/Z, [x15]\n"
+ "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
"fadd z17.h, z7.h, z6.h\n"
@@ -172,32 +172,32 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
"whilelt p0.h, x14, x2\n"
- "fadd z19.h, z17.h, z16.h\n"
+ "fadd z20.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
"fadd z17.h, z1.h, z31.h\n"
- "fadd z22.h, z30.h, z29.h\n"
+ "fadd z19.h, z30.h, z29.h\n"
"fadd z16.h, z28.h, z27.h\n"
- "fadd z21.h, z18.h, z19.h\n"
- "fadd z20.h, z16.h, z19.h\n"
- "fadd z19.h, z26.h, z17.h\n"
- "fadd z18.h, z25.h, z22.h\n"
+ "fadd z21.h, z18.h, z20.h\n"
+ "fadd z20.h, z16.h, z20.h\n"
+ "fadd z16.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z19.h\n"
"fadd z17.h, z24.h, z17.h\n"
- "fadd z16.h, z23.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
- "fmul z19.h, z19.h, z0.h[0]\n"
- "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z16.h, z21.h, z16.h\n"
+ "fmul z16.h, z16.h, z0.h[0]\n"
+ "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
"fadd z18.h, z21.h, z18.h\n"
"fadd z17.h, z17.h, z20.h\n"
"fmul z18.h, z18.h, z0.h[1]\n"
"fmul z17.h, z17.h, z0.h[2]\n"
- "fadd z16.h, z16.h, z20.h\n"
+ "fadd z16.h, z19.h, z20.h\n"
"fmul z16.h, z16.h, z0.h[3]\n"
"st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
"st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
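The fp16 kernel above evaluates 3x3 stride-1 average pooling for a 2x2 output tile: the sixteen loads cover what is presumably a 4x4 input patch, the fadd tree shares partial sums between the four overlapping windows, and each output is scaled by its own lane of rescale_vals (z0.h[0..3]), which lets border windows average over fewer valid elements. A scalar sketch, using float in place of fp16 and without the shared-sum factoring; the lane-to-output mapping is assumed:

    // Scalar model: 2x2 tile of 3x3 stride-1 average pooling over a
    // 4x4 patch, with one rescale factor per output (rescale_vals lanes).
    static void avg_pool_3x3_s1(const float in[4][4], const float rescale[4],
                                float out[2][2])
    {
        for (int oy = 0; oy < 2; ++oy)
            for (int ox = 0; ox < 2; ++ox)
            {
                float sum = 0.0f;
                for (int ky = 0; ky < 3; ++ky)
                    for (int kx = 0; kx < 3; ++kx)
                        sum += in[oy + ky][ox + kx];
                out[oy][ox] = sum * rescale[oy * 2 + ox]; // fmul z.h, z.h, z0.h[n]
            }
    }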
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 594c65e18d..2bef44ea5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -57,68 +57,68 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.h, z2.h, z1.h\n"
"fadd z19.h, z0.h, z31.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z29.h }, p2/Z, [x22, x28, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
"fadd z6.h, z6.h, z19.h\n"
"fadd z5.h, z5.h, z18.h\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
"fadd z4.h, z4.h, z17.h\n"
"fadd z3.h, z3.h, z16.h\n"
- "ld1h { z26.h }, p1/Z, [x22, x27, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x21, x27, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x23, x26, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x22, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.h, z2.h, z1.h\n"
@@ -141,16 +141,16 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.h, z6.h, z2.h\n"
- "ld1h { z30.h }, p2/Z, [x24, x28, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x24, x27, LSL #1]\n"
- "fadd z5.h, z5.h, z30.h\n"
- "fadd z4.h, z4.h, z27.h\n"
- "ld1h { z25.h }, p0/Z, [x24, x26, LSL #1]\n"
- "fadd z3.h, z3.h, z25.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z5.h, z5.h, z17.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.h, z6.h, z7.h\n"
@@ -173,44 +173,44 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z2.h, z1.h\n"
- "fadd z19.h, z0.h, z31.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z19.h, z23.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.h, z17.h, z16.h\n"
"subs x25, x25, #0x1\n"
- "fadd z6.h, z6.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z2.h, z1.h\n"
- "fadd z19.h, z0.h, z31.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z6.h, z6.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z2.h }, p3/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.h, z6.h, z2.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z6.h, z6.h, z7.h\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 838cd3406c..31bbfd085e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p0.h, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1h { z31.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x14, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x21, x14, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x28, x14, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z25.h }, p2/Z, [x23, x14, LSL #1]\n"
- "ld1h { z24.h }, p2/Z, [x22, x14, LSL #1]\n"
- "ld1h { z23.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x14, LSL #1]\n"
"incw x14\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.h, p1/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p1/M, z21.h, z29.h\n"
- "ld1h { z31.h }, p2/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x24, x14, LSL #1]\n"
- "movprfx z20, z28\n fmax z20.h, p1/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p1/M, z19.h, z25.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x14, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x28, x14, LSL #1]\n"
- "movprfx z17, z28\n fmax z17.h, p1/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p1/M, z18.h, z23.h\n"
- "ld1h { z28.h }, p2/Z, [x25, x14, LSL #1]\n"
- "ld1h { z26.h }, p2/Z, [x26, x14, LSL #1]\n"
- "ld1h { z25.h }, p2/Z, [x23, x14, LSL #1]\n"
- "ld1h { z24.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
+ "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
+ "ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
"whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p1/M, z16.h, z20.h\n"
- "ld1h { z23.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
"incw x14\n"
- "whilelt p2.h, x14, x15\n"
+ "whilelt p1.h, x14, x15\n"
"st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p1/M, z16.h, z22.h\n"
- "fmax z17.h, p1/M, z17.h, z21.h\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
"st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z18\n fmax z16.h, p1/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
"st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.h, p1/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p1/M, z21.h, z29.h\n"
- "movprfx z20, z28\n fmax z20.h, p1/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p1/M, z19.h, z25.h\n"
- "movprfx z17, z28\n fmax z17.h, p1/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p1/M, z18.h, z23.h\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
+ "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
"whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p1/M, z16.h, z20.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
"st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p1/M, z16.h, z22.h\n"
- "fmax z17.h, p1/M, z17.h, z21.h\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
"st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z18\n fmax z16.h, p1/M, z16.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
"st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 9f1f9e7377..1a01412836 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.h, #0xfc00\n"
"mov z7.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
"mov z5.h, #0xfc00\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
"movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
"fmax z22.h, p0/M, z22.h, z30.h\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
"movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
"fmax z21.h, p0/M, z21.h, z27.h\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
"fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
"fmax z19.h, p0/M, z19.h, z23.h\n"
"fmax z18.h, p0/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
"subs x25, x25, #0x1\n"
"fmax z8.h, p0/M, z8.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
"fmax z7.h, p0/M, z7.h, z18.h\n"
"fmax z6.h, p0/M, z6.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
"fmax z5.h, p0/M, z5.h, z16.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z20.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
@@ -138,16 +138,16 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z4.h\n"
- "ld1h { z0.h }, p3/Z, [x24, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmax z7.h, p0/M, z7.h, z0.h\n"
- "fmax z6.h, p0/M, z6.h, z29.h\n"
- "ld1h { z26.h }, p1/Z, [x24, x26, LSL #1]\n"
- "fmax z5.h, p0/M, z5.h, z26.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z17.h\n"
+ "fmax z6.h, p0/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
@@ -166,44 +166,44 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.h, #0xfc00\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
"subs x25, x25, #0x1\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x23, x9, LSL #1]\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
- "add x20, x20, #0x20\n"
- "ld1h { z2.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1h { z4.h }, p4/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z4.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 39197aa04d..c5ea5adea0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -88,8 +88,8 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"ldp x5, x6, [x21, #0x0]\n"
- "whilelt p0.s, XZR, x20\n"
- "whilelt p1.s, x3, x2\n"
+ "whilelt p2.s, XZR, x20\n"
+ "whilelt p0.s, x3, x2\n"
"ldp x7, x8, [x21, #0x10]\n"
"ldp x17, x16, [x4, #0x0]\n"
"add x15, %x[args], %[offsetof_rescale]\n"
@@ -101,25 +101,25 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
- "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
- "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
- "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
"whilelt p1.s, x3, x2\n"
- "ld1rqw { z0.s }, p0/Z, [x15]\n"
+ "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
"fadd z17.s, z7.s, z6.s\n"
@@ -172,32 +172,32 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
"whilelt p0.s, x14, x2\n"
- "fadd z19.s, z17.s, z16.s\n"
+ "fadd z20.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
"fadd z17.s, z1.s, z31.s\n"
- "fadd z22.s, z30.s, z29.s\n"
+ "fadd z19.s, z30.s, z29.s\n"
"fadd z16.s, z28.s, z27.s\n"
- "fadd z21.s, z18.s, z19.s\n"
- "fadd z20.s, z16.s, z19.s\n"
- "fadd z19.s, z26.s, z17.s\n"
- "fadd z18.s, z25.s, z22.s\n"
+ "fadd z21.s, z18.s, z20.s\n"
+ "fadd z20.s, z16.s, z20.s\n"
+ "fadd z16.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z19.s\n"
"fadd z17.s, z24.s, z17.s\n"
- "fadd z16.s, z23.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
- "fmul z19.s, z19.s, z0.s[0]\n"
- "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z16.s, z21.s, z16.s\n"
+ "fmul z16.s, z16.s, z0.s[0]\n"
+ "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
"fadd z18.s, z21.s, z18.s\n"
"fadd z17.s, z17.s, z20.s\n"
"fmul z18.s, z18.s, z0.s[1]\n"
"fmul z17.s, z17.s, z0.s[2]\n"
- "fadd z16.s, z16.s, z20.s\n"
+ "fadd z16.s, z19.s, z20.s\n"
"fmul z16.s, z16.s, z0.s[3]\n"
"st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
"st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index c1a3e5de84..7c94894892 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -57,68 +57,68 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
"mov z3.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"fadd z23.s, z2.s, z1.s\n"
"fadd z19.s, z0.s, z31.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z29.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
"fadd z6.s, z6.s, z19.s\n"
"fadd z5.s, z5.s, z18.s\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
"fadd z4.s, z4.s, z17.s\n"
"fadd z3.s, z3.s, z16.s\n"
- "ld1w { z26.s }, p1/Z, [x22, x27, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x21, x27, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x23, x26, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x22, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"fadd z23.s, z2.s, z1.s\n"
@@ -141,16 +141,16 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.s, z6.s, z2.s\n"
- "ld1w { z30.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x24, x27, LSL #2]\n"
- "fadd z5.s, z5.s, z30.s\n"
- "fadd z4.s, z4.s, z27.s\n"
- "ld1w { z25.s }, p0/Z, [x24, x26, LSL #2]\n"
- "fadd z3.s, z3.s, z25.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.s, z6.s, z7.s\n"
@@ -173,44 +173,44 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z2.s, z1.s\n"
- "fadd z19.s, z0.s, z31.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fadd z19.s, z23.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.s, z17.s, z16.s\n"
"subs x25, x25, #0x1\n"
- "fadd z6.s, z6.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z2.s, z1.s\n"
- "fadd z19.s, z0.s, z31.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z6.s, z6.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z2.s }, p3/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.s, z6.s, z2.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"fmul z6.s, z6.s, z7.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index da0239cea8..d9cebd1363 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p0.s, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1w { z31.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x14, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x21, x14, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x28, x14, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [x23, x14, LSL #2]\n"
- "ld1w { z24.s }, p2/Z, [x22, x14, LSL #2]\n"
- "ld1w { z23.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.s, p1/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p1/M, z21.s, z29.s\n"
- "ld1w { z31.s }, p2/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x24, x14, LSL #2]\n"
- "movprfx z20, z28\n fmax z20.s, p1/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p1/M, z19.s, z25.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x14, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x28, x14, LSL #2]\n"
- "movprfx z17, z28\n fmax z17.s, p1/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p1/M, z18.s, z23.s\n"
- "ld1w { z28.s }, p2/Z, [x25, x14, LSL #2]\n"
- "ld1w { z26.s }, p2/Z, [x26, x14, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [x23, x14, LSL #2]\n"
- "ld1w { z24.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
+ "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
+ "ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
"whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p1/M, z16.s, z20.s\n"
- "ld1w { z23.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
- "whilelt p2.s, x14, x15\n"
+ "whilelt p1.s, x14, x15\n"
"st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p1/M, z16.s, z22.s\n"
- "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
"st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z18\n fmax z16.s, p1/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
"st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.s, p1/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p1/M, z21.s, z29.s\n"
- "movprfx z20, z28\n fmax z20.s, p1/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p1/M, z19.s, z25.s\n"
- "movprfx z17, z28\n fmax z17.s, p1/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p1/M, z18.s, z23.s\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
+ "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
"whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p1/M, z16.s, z20.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
"st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p1/M, z16.s, z22.s\n"
- "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
"st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z18\n fmax z16.s, p1/M, z16.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
"st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index ddce2be62c..87fc75adda 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.s, #0xff800000\n"
"mov z7.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
"mov z5.s, #0xff800000\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
"movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
"fmax z22.s, p0/M, z22.s, z30.s\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
"movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
"fmax z21.s, p0/M, z21.s, z27.s\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
"fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
"fmax z19.s, p0/M, z19.s, z23.s\n"
"fmax z18.s, p0/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
"subs x25, x25, #0x1\n"
"fmax z8.s, p0/M, z8.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
"fmax z7.s, p0/M, z7.s, z18.s\n"
"fmax z6.s, p0/M, z6.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
"fmax z5.s, p0/M, z5.s, z16.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z20.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
@@ -138,16 +138,16 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z4.s\n"
- "ld1w { z0.s }, p3/Z, [x24, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmax z7.s, p0/M, z7.s, z0.s\n"
- "fmax z6.s, p0/M, z6.s, z29.s\n"
- "ld1w { z26.s }, p1/Z, [x24, x26, LSL #2]\n"
- "fmax z5.s, p0/M, z5.s, z26.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z17.s\n"
+ "fmax z6.s, p0/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
@@ -166,44 +166,44 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.s, #0xff800000\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
"subs x25, x25, #0x1\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x23, x9, LSL #2]\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
- "add x20, x20, #0x20\n"
- "ld1w { z2.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1w { z4.s }, p4/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z4.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 68bd831d63..7925905e64 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -109,7 +109,7 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,42 +125,42 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -203,20 +203,20 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -332,49 +332,49 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 96e20c752e..5681cc1f3d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p0.b, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n smax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p1/M, z21.b, z29.b\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "movprfx z20, z28\n smax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p1/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "movprfx z17, z28\n smax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p1/M, z18.b, z23.b\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p1/M, z16.b, z20.b\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p1/M, z16.b, z22.b\n"
- "smax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n smax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n smax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p1/M, z21.b, z29.b\n"
- "movprfx z20, z28\n smax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p1/M, z19.b, z25.b\n"
- "movprfx z17, z28\n smax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p1/M, z18.b, z23.b\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p1/M, z16.b, z20.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p1/M, z16.b, z22.b\n"
- "smax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n smax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index 7d14edddeb..da9e1408f9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
"smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
"smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"smax z7.b, p0/M, z7.b, z18.b\n"
"smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"smax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
@@ -138,16 +138,16 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "smax z7.b, p0/M, z7.b, z0.b\n"
- "smax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "smax z5.b, p0/M, z5.b, z26.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
@@ -166,44 +166,44 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 7161c4f389..19a3b112ad 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -128,7 +128,7 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -144,42 +144,42 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -222,20 +222,20 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508a3b5 // sshllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508a7b4 // sshllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508a373 // sshllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508a772 // sshllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508a331 // sshllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
@@ -368,57 +368,57 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508a3f7 // sshllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508a7f6 // sshllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
"ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 19209811d8..4fc1532d5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -56,68 +56,68 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
"smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
"smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"smax z7.b, p0/M, z7.b, z18.b\n"
"smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"smax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
@@ -140,16 +140,16 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "smax z7.b, p0/M, z7.b, z0.b\n"
- "smax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "smax z5.b, p0/M, z5.b, z26.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
@@ -292,82 +292,82 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z4.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
+ ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
- ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
+ ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
"mov z18.s, #0x7f\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
"not z16.s, p0/M, z18.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z17.s, p0/M, z17.s, z18.s\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "trn1 z17.h, z1.h, z17.h\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index f888038a2a..f3f4950a1f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -109,7 +109,7 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -125,42 +125,42 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -203,20 +203,20 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -332,49 +332,49 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
"mov z14.s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
@@ -387,17 +387,17 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z18.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 70d308a585..8612555bfb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -66,10 +66,10 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p0.b, x14, x15\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"ldp x13, x12, [x21, #0x0]\n"
- "ptrue p1.b\n"
+ "ptrue p2.b\n"
"mov x11, #0x0\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
@@ -77,61 +77,61 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n umax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p1/M, z21.b, z29.b\n"
- "ld1b { z31.b }, p2/Z, [x27, x14]\n"
- "ld1b { z30.b }, p2/Z, [x24, x14]\n"
- "movprfx z20, z28\n umax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p1/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x14]\n"
- "ld1b { z27.b }, p2/Z, [x28, x14]\n"
- "movprfx z17, z28\n umax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p1/M, z18.b, z23.b\n"
- "ld1b { z28.b }, p2/Z, [x25, x14]\n"
- "ld1b { z26.b }, p2/Z, [x26, x14]\n"
- "ld1b { z25.b }, p2/Z, [x23, x14]\n"
- "ld1b { z24.b }, p2/Z, [x22, x14]\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p1/M, z16.b, z20.b\n"
- "ld1b { z23.b }, p2/Z, [x20, x14]\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
- "whilelt p2.b, x14, x15\n"
+ "whilelt p1.b, x14, x15\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p1/M, z16.b, z22.b\n"
- "umax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n umax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
"incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n umax z22.b, p1/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p1/M, z21.b, z29.b\n"
- "movprfx z20, z28\n umax z20.b, p1/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p1/M, z19.b, z25.b\n"
- "movprfx z17, z28\n umax z17.b, p1/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p1/M, z18.b, z23.b\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
+ "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
"whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p1/M, z16.b, z20.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
"st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p1/M, z16.b, z22.b\n"
- "umax z17.b, p1/M, z17.b, z21.b\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
"st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z18\n umax z16.b, p1/M, z16.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
"st1b { z17.b }, p0, [x10, x11]\n"
"st1b { z16.b }, p0, [x9, x11]\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index 34aa5a3dd6..be0eb398ae 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -54,68 +54,68 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
"umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
"umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"umax z7.b, p0/M, z7.b, z18.b\n"
"umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
@@ -138,16 +138,16 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "umax z7.b, p0/M, z7.b, z0.b\n"
- "umax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "umax z5.b, p0/M, z5.b, z26.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
@@ -166,44 +166,44 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"st1b { z8.b }, p4, [%x[outptr], x9]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 36ac381004..e8339a2cd9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -136,7 +136,7 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -149,42 +149,42 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x21, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x21, x25]\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -227,20 +227,20 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p3/Z, [x22, x26]\n"
- "ld1b { z27.b }, p2/Z, [x22, x25]\n"
- ".inst 0x4508abb5 // ushllb z21.h, z29.b, #0x0\n"
- ".inst 0x4508afb4 // ushllt z20.h, z29.b, #0x0\n"
- "ld1b { z25.b }, p1/Z, [x22, x24]\n"
- ".inst 0x4508ab73 // ushllb z19.h, z27.b, #0x0\n"
- ".inst 0x4508af72 // ushllt z18.h, z27.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x4508ab31 // ushllb z17.h, z25.b, #0x0\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
@@ -393,55 +393,55 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ldp x22, x21, [x20, #0x0]\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "add x20, x20, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z30.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ld1b { z31.b }, p4/Z, [x22, x27]\n"
- ".inst 0x4508abf7 // ushllb z23.h, z31.b, #0x0\n"
- ".inst 0x4508aff6 // ushllt z22.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
"ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
@@ -457,17 +457,17 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
+ "mov z17.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z16.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index a00cbc59d8..94522cdaaa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -56,68 +56,68 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
"movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
"umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
"movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
"umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z31.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x22, x28]\n"
- "ld1b { z30.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "ld1b { z28.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
"subs x25, x25, #0x1\n"
"umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x22, x27]\n"
- "ld1b { z27.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
"umax z7.b, p0/M, z7.b, z18.b\n"
"umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "ld1b { z25.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x22, x26]\n"
- "ld1b { z24.b }, p1/Z, [x21, x26]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
"movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
@@ -140,103 +140,103 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
- "ld1b { z0.b }, p3/Z, [x24, x28]\n"
- "ld1b { z29.b }, p2/Z, [x24, x27]\n"
- "umax z7.b, p0/M, z7.b, z0.b\n"
- "umax z6.b, p0/M, z6.b, z29.b\n"
- "ld1b { z26.b }, p1/Z, [x24, x26]\n"
- "umax z5.b, p0/M, z5.b, z26.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a918 // ushllb z24.h, z8.b, #0x0\n"
- ".inst 0x4508ad17 // ushllt z23.h, z8.b, #0x0\n"
- ".inst 0x4508a8f6 // ushllb z22.h, z7.b, #0x0\n"
- ".inst 0x4508acf5 // ushllt z21.h, z7.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a8d4 // ushllb z20.h, z6.b, #0x0\n"
- ".inst 0x4508acd3 // ushllt z19.h, z6.b, #0x0\n"
"ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
+ ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
+ ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
+ "neg z3.s, p0/M, z3.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
+ ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a8b2 // ushllb z18.h, z5.b, #0x0\n"
- ".inst 0x4508acb1 // ushllt z17.h, z5.b, #0x0\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x45984082 // saddwb z2.s, z4.s, z24.h\n"
- ".inst 0x45984481 // saddwt z1.s, z4.s, z24.h\n"
- ".inst 0x44828062 // srshl z2.s, p0/M, z2.s, z3.s\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- ".inst 0x45974080 // saddwb z0.s, z4.s, z23.h\n"
- ".inst 0x4597449f // saddwt z31.s, z4.s, z23.h\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
- ".inst 0x4596449d // saddwt z29.s, z4.s, z22.h\n"
- ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
- ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
- ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
- ".inst 0x4595449b // saddwt z27.s, z4.s, z21.h\n"
- ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
- ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
- ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
- ".inst 0x45944499 // saddwt z25.s, z4.s, z20.h\n"
- ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
- ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
- ".inst 0x45934098 // saddwb z24.s, z4.s, z19.h\n"
- ".inst 0x45934497 // saddwt z23.s, z4.s, z19.h\n"
- ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
- ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
- ".inst 0x45924096 // saddwb z22.s, z4.s, z18.h\n"
- ".inst 0x45924495 // saddwt z21.s, z4.s, z18.h\n"
- ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
- ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
- ".inst 0x45914094 // saddwb z20.s, z4.s, z17.h\n"
- ".inst 0x45914493 // saddwt z19.s, z4.s, z17.h\n"
- ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
- ".inst 0x44828073 // srshl z19.s, p0/M, z19.s, z3.s\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
+ ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
+ ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
+ ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
+ ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
+ ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
+ ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
+ ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
+ ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
- ".inst 0x04b077ff // sqrdmulh z31.s, z31.s, z16.s\n"
- ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
- ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
- ".inst 0x04b077de // sqrdmulh z30.s, z30.s, z16.s\n"
- ".inst 0x04b077bd // sqrdmulh z29.s, z29.s, z16.s\n"
- ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
- ".inst 0x4482823f // srshl z31.s, p0/M, z31.s, z17.s\n"
- ".inst 0x04b0779c // sqrdmulh z28.s, z28.s, z16.s\n"
- ".inst 0x04b0777b // sqrdmulh z27.s, z27.s, z16.s\n"
- ".inst 0x4482823e // srshl z30.s, p0/M, z30.s, z17.s\n"
- ".inst 0x4482823d // srshl z29.s, p0/M, z29.s, z17.s\n"
- ".inst 0x04b0775a // sqrdmulh z26.s, z26.s, z16.s\n"
- ".inst 0x04b07739 // sqrdmulh z25.s, z25.s, z16.s\n"
- ".inst 0x4482823c // srshl z28.s, p0/M, z28.s, z17.s\n"
- ".inst 0x4482823b // srshl z27.s, p0/M, z27.s, z17.s\n"
- ".inst 0x04b07718 // sqrdmulh z24.s, z24.s, z16.s\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x4482823a // srshl z26.s, p0/M, z26.s, z17.s\n"
- ".inst 0x44828239 // srshl z25.s, p0/M, z25.s, z17.s\n"
- ".inst 0x04b076d6 // sqrdmulh z22.s, z22.s, z16.s\n"
- ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
- ".inst 0x44828238 // srshl z24.s, p0/M, z24.s, z17.s\n"
- ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
- ".inst 0x44828236 // srshl z22.s, p0/M, z22.s, z17.s\n"
- ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
- ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
- ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
+ ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
+ ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
"add z31.s, z31.s, z16.s\n"
"add z30.s, z30.s, z16.s\n"
@@ -252,8 +252,8 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"add z20.s, z20.s, z16.s\n"
"add z19.s, z19.s, z16.s\n"
"mov z16.s, #0x0\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
"smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
"smax z0.s, p0/M, z0.s, z16.s\n"
"smax z31.s, p0/M, z31.s, z16.s\n"
"mov z18.s, #0xff\n"
@@ -269,9 +269,9 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"smax z21.s, p0/M, z21.s, z16.s\n"
"smax z20.s, p0/M, z20.s, z16.s\n"
"smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
"smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z2.h, z1.h\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
"smin z0.s, p0/M, z0.s, z18.s\n"
"smin z31.s, p0/M, z31.s, z18.s\n"
"trn1 z16.h, z0.h, z31.h\n"
@@ -313,91 +313,91 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "add x20, x20, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "ldp x24, x23, [x20, #0x0]\n"
- "ldp x22, x21, [x20, #0x10]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
- "ld1b { z3.b }, p4/Z, [x23, x9]\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "add x20, x20, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x22, x9]\n"
- "ld1b { z1.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x24, [x20], #0x8\n"
- "ld1b { z4.b }, p4/Z, [x24, x9]\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z4.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a918 // ushllb z24.h, z8.b, #0x0\n"
- ".inst 0x4508ad17 // ushllt z23.h, z8.b, #0x0\n"
- "neg z4.s, p0/M, z4.s\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x45984082 // saddwb z2.s, z4.s, z24.h\n"
- ".inst 0x45984481 // saddwt z1.s, z4.s, z24.h\n"
- ".inst 0x45974080 // saddwb z0.s, z4.s, z23.h\n"
- ".inst 0x4597449f // saddwt z31.s, z4.s, z23.h\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
+ ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
+ ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x44828062 // srshl z2.s, p0/M, z2.s, z3.s\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
- ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
- ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
"ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x04b077ff // sqrdmulh z31.s, z31.s, z16.s\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
- ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z2.s, z2.s, z16.s\n"
- ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
- ".inst 0x4482823f // srshl z31.s, p0/M, z31.s, z17.s\n"
- "add z1.s, z1.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
"mov z16.s, #0x0\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z2.h, z1.h\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "mov z16.s, #0xff\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z21.h, z20.h\n"
+ "smin z19.s, p0/M, z19.s, z16.s\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index 63333c8fb4..8a6e63d993 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -101,7 +101,7 @@ class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
{
auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
ws->input_buffer = ws + 1;
- ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * n_channels;
// Fill the input buffer with an appropriate value
TInput fill_val = 0;
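
The old and new expressions address the same byte, since pointer arithmetic on a TInput * already advances in sizeof(TInput) steps; the rewrite simply states the offset in bytes rather than typing the output buffer, which need not hold TInput elements, through a TInput pointer. A minimal, self-contained check of that equivalence (illustrative code, not the library's):

    #include <cassert>
    #include <cstdint>

    struct WorkingSpace { void *input_buffer; void *output_buffer; };

    int main()
    {
        using TInput = int16_t;
        const unsigned n_channels = 24;
        alignas(WorkingSpace) char raw[sizeof(WorkingSpace) + 2 * n_channels * sizeof(TInput)];
        auto ws = reinterpret_cast<WorkingSpace *>(raw);

        void *old_expr = reinterpret_cast<TInput *>(ws + 1) + n_channels;
        void *new_expr = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * n_channels;
        assert(old_expr == new_expr);  // same address, stated in elements vs bytes
        return 0;
    }
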
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
deleted file mode 100644
index 4aabd957cd..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "pool_common.hpp"
-
-#include <stack>
-#include <vector>
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
-
- constexpr static unsigned int input_rows(void)
- {
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
- }
-
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
-
- size_t sizeof_input_buffer(void) const
- {
- return sizeof(TInput) * m_args.n_channels;
- }
-
- size_t sizeof_output_buffer(void) const
- {
- return sizeof(TOutput) * m_args.n_channels;
- }
-
- public:
- PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
- {
- }
-
- PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
- PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
-
- size_t get_working_size(void) const override
- {
- // We require an array of pointers for the inputs and outputs, a
- // channel-length vector in which to dump surplus output, and a
- // channel-length vector of padding values.
- return sizeof_input_buffer() + sizeof_output_buffer();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
-
- // Fill the input buffer
- const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
- ? static_cast<TInput>(0)
- : (std::numeric_limits<TInput>::has_infinity
- ? -std::numeric_limits<TInput>::infinity()
- : std::numeric_limits<TInput>::lowest());
- for (unsigned int i = 0; i < channels; i++)
- {
- input_buffer[i] = pad_value;
- }
-
- // Keep subdividing the output plane across the longest dimension until we
- // reach the size of the tile. Queue items for later processing. Note - we
- // can determine the largest size of the queue a priori from the input
-    // tensor size; this would allow us to allocate memory within the working
- // space and improve performance.
- struct WorkItem
- {
- unsigned int output_i, output_j;
- unsigned int output_height, output_width;
-
- WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
- : output_i(i), output_j(j), output_height(height), output_width(width) {}
- };
-
- auto execute = [&] (const WorkItem &item) {
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Construct the output pointer array
- {
- const auto output_pad_right = strategy::out_rows() - item.output_width;
- auto outptr_element = outptr_array;
- auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
-
- // Fill the array with pointers to the output buffer
- for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
- {
- outptr_array[i] = output_buffer;
- }
-
- // Fill in the valid portion of the array
- for (unsigned int i = 0; i < item.output_height; i++)
- {
- auto outptr_col = outptr_row;
- for (unsigned int j = 0; j < item.output_width; j++)
- {
- *(outptr_element++) = outptr_col;
- outptr_col += ld_output_col;
- }
- outptr_element += output_pad_right;
- outptr_row += ld_output_row;
- }
- }
-
- const int start_i = item.output_i * strategy::stride_rows() - padding.top;
- const int end_i = start_i + input_rows();
- const unsigned int pad_top = std::max(0, 0 - start_i);
- const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
-
- const int start_j = item.output_j * strategy::stride_cols() - padding.left;
- const int end_j = start_j + input_cols();
- const unsigned int pad_left = std::max(0, 0 - start_j);
- const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
- {
- const unsigned int row_padding = pad_top + pad_bottom;
- const unsigned int valid_rows = input_rows() - row_padding;
-
- const unsigned int col_padding = pad_left + pad_right;
- const unsigned int valid_cols = input_cols() - col_padding;
-
- // Fill the array with pointers to the input buffer
- for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
- {
- inptr_array[i] = input_buffer;
- }
-
- // Compute valid initial pointer
- auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
-
- // Fill in the valid portion of the input array
- auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
- for (unsigned int i = 0; i < valid_rows; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = 0; j < valid_cols; j++)
- {
- *(inptr_element++) = inptr_col;
- inptr_col += ld_input_col;
- }
-
- inptr_row += ld_input_row;
- inptr_element += col_padding; // Skip the padding elements
- }
- }
-
- // Call the kernel
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(channels, inptr_array, outptr_array,
- pad_left, pad_top, pad_right, pad_bottom);
- };
-
- // Add the initial work item to the stack of work.
- std::stack<WorkItem, std::vector<WorkItem>> stack;
- stack.push(WorkItem(0, 0, output_height, output_width));
- while (!stack.empty())
- {
- // Pop an item from the stack, bisect the largest dimension and either
- // execute the resulting tiles or add them to the stack if they are too
- // large.
- const WorkItem item(stack.top());
- stack.pop();
-
- if (item.output_height <= strategy::out_rows() &&
- item.output_width <= strategy::out_cols())
- {
- execute(item);
- }
- else
- {
- // Split the largest dimension, such that we get an exact number of
- // tiles in the first partition.
- if (item.output_height >= item.output_width)
- {
- const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
- const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
-
- const unsigned int height_first = tiles_first * strategy::out_rows();
- const unsigned int height_second = item.output_height - height_first;
-
- stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
- stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
- }
- else
- {
- const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
- const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
-
- const unsigned int width_first = tiles_first * strategy::out_cols();
- const unsigned int width_second = item.output_width - width_first;
-
- stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
- stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
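
The file removed above scheduled tiles cache-obliviously: instead of walking the output row-major, it bisected the plane along its longer side until a tile fitted the kernel's native block, so the working set shrinks geometrically at every level without ever consulting actual cache sizes. A condensed, self-contained model of that loop (types simplified; the split condition gains an extra guard so each half is always non-empty):

    #include <stack>
    #include <vector>

    struct Tile { unsigned i, j, height, width; };

    template <typename F>
    void schedule(unsigned out_height, unsigned out_width,
                  unsigned tile_rows, unsigned tile_cols, F &&run)
    {
        std::stack<Tile, std::vector<Tile>> work;
        work.push({0, 0, out_height, out_width});
        while (!work.empty())
        {
            const Tile t = work.top();
            work.pop();
            if (t.height <= tile_rows && t.width <= tile_cols)
            {
                run(t);  // small enough for a single kernel call
            }
            else if (t.height > tile_rows && (t.width <= tile_cols || t.height >= t.width))
            {
                // Split rows so the first part is a whole number of tiles.
                const unsigned n = (t.height + tile_rows - 1) / tile_rows;
                const unsigned first = (n - n / 2) * tile_rows;
                work.push({t.i + first, t.j, t.height - first, t.width});
                work.push({t.i, t.j, first, t.width});
            }
            else  // here t.width > tile_cols
            {
                const unsigned n = t.width / tile_cols;
                const unsigned first = (n - n / 2) * tile_cols;
                work.push({t.i, t.j + first, t.height, t.width - first});
                work.push({t.i, t.j, t.height, first});
            }
        }
    }

Because the stack is LIFO, the first half of every split is fully processed before the second, which is what gives the recursion its cache-oblivious locality.
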
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index 65d9a91977..07c582059f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#pragma once
-#include "arm_compute/core/Error.h"
#include "depthfirst_driver.hpp"
#include "utils.hpp"
#if !defined(_WIN64) && !defined(__OpenBSD__)
@@ -208,10 +207,9 @@ class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
const unsigned int channel_start, const unsigned int channel_end,
const TensorSpec<const TInput *> &input,
const TensorSpec<TOutput *> &output,
- void *working_space
+ void *
) const override
{
- ARM_COMPUTE_UNUSED(working_space);
// Determine start position and padding
const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
deleted file mode 100644
index f3cb9a1d1f..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "pool_common.hpp"
-#include "utils.hpp"
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
- const Requantize32 m_requant; // Quantization parameters
-
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
-
- public:
- PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
- {
- }
-
- PoolingDepthfirstGenericQuantized(PoolingDepthfirstGenericQuantized &) = delete;
- PoolingDepthfirstGenericQuantized &operator=(PoolingDepthfirstGenericQuantized &) = delete;
-
- size_t sizeof_input_pointer_array(void) const
- {
- return sizeof(TInput *) * input_rows() * input_cols();
- }
-
- size_t get_working_size(unsigned int num_threads) const override
- {
- return num_threads * sizeof_input_pointer_array();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
- {
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
-
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
- }
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
-
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
- {
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- }
-
- // Compute the number of valid cells
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
- const auto valid_cols = input_cols() - pad_left - pad_right;
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
-#endif
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
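
The quantised generic driver removed above derived, for each output coordinate, the clamped window extent and the divisor used for average pooling from a handful of min/max operations. A sketch of that padding bookkeeping for one output row, with illustrative names:

    #include <algorithm>

    struct RowInfo { unsigned pad_top, pad_bottom, total_rows; };

    // 'window' pool rows starting at out_i*stride - pad_top_cfg, against an
    // input of 'height' rows with declared top/bottom padding.
    RowInfo row_padding(int out_i, int stride, int window,
                        int height, int pad_top_cfg, int pad_bottom_cfg)
    {
        const int start = out_i * stride - pad_top_cfg;
        const int end   = start + window;
        const unsigned pad_top    = static_cast<unsigned>(-std::min(start, 0));
        const unsigned pad_bottom = static_cast<unsigned>(-std::min(height - end, 0));
        // Window rows lying in either the valid input or the declared padding.
        const int padded_bottom = std::min(end, height + pad_bottom_cfg);
        return { pad_top, pad_bottom, static_cast<unsigned>(padded_bottom - start) };
    }

For example, out_i = 0, stride = 1, window = 3, height = 5 and one row of padding on each side gives start = -1, pad_top = 1, pad_bottom = 0 and total_rows = 3; with exclude_padding the kernel instead divides by the 2 valid rows (times the analogous column count).
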
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 515d55c73b..2d743a4bd6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,8 @@
#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
#endif // ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
#include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
@@ -204,6 +206,30 @@ GemmImplementation<bfloat16, float>::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_mmla_8x12",
+ KernelWeightFormat::VL256_BL64,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffhybrid_bf16fp32_mmla_6x16",
+ KernelWeightFormat::VL256_BL64,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_dot_8x12",
+ KernelWeightFormat::VL128_BL32,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
+),
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
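
Each with_estimate() entry above carries a support predicate (here the has_bf16() gate), a cycle estimator and an instantiation hook; selection can then rank all supported candidates by their estimates. A self-contained sketch of that ranking, with hypothetical names standing in for the real GemmArgs machinery:

    #include <cstddef>
    #include <cstdint>

    struct Args { bool has_bf16; };

    struct Candidate {
        const char *name;
        bool     (*supported)(const Args &);      // may be null: always supported
        uint64_t (*cycle_estimate)(const Args &);
    };

    // Return the supported candidate with the lowest cycle estimate, or null.
    const Candidate *pick_cheapest(const Candidate *cands, size_t n, const Args &args)
    {
        const Candidate *best = nullptr;
        uint64_t best_cycles = UINT64_MAX;
        for (size_t i = 0; i < n; i++)
        {
            if (cands[i].supported && !cands[i].supported(args))
                continue;                         // e.g. no BF16 on this CPU
            const uint64_t cycles = cands[i].cycle_estimate(args);
            if (cycles < best_cycles)
            {
                best = &cands[i];
                best_cycles = cycles;
            }
        }
        return best;
    }
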
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index ee567a2498..44a7bb894a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -66,6 +66,10 @@
#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
#endif // ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
#include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
#include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 19c8fcadd3..5e77df7d4a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020, 2022 Arm Limited.
+ * Copyright (c) 2018-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -212,9 +212,11 @@ struct GemmImplementation<Top, Tret, Nothing> {
instantiate(instantiate) { }
};
-/* "Main" function implemented for each valid combination of types.
- * Returns a list of GEMM implementation descriptors for processing by the
- * other functions, ended by an implementation with
+/* Provides the list of implementation descriptors which is processed by the
+ * other functions.
+ *
+ * A specialised version is provided for each supported combination of types.
+ * The end of the list is indicated by a sentinel descriptor with
* method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret, class OutputStage = Nothing>
const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
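
The reworded comment makes the contract explicit: callers iterate until they reach the GemmMethod::DEFAULT sentinel rather than being handed a length. A minimal self-contained model of walking such a list (names hypothetical):

    enum class Method { DEFAULT, GEMM_INTERLEAVED, GEMM_HYBRID };

    struct Descriptor {
        Method method;
        const char *name;
        bool (*is_supported)(int);  // stand-in for a predicate over GemmArgs
    };

    static bool wide_only(int n_cols) { return n_cols >= 8; }

    static const Descriptor impl_list[] = {
        { Method::GEMM_HYBRID,      "hybrid_kernel",      wide_only },
        { Method::GEMM_INTERLEAVED, "interleaved_kernel", nullptr   },  // always supported
        { Method::DEFAULT,          nullptr,              nullptr   },  // sentinel ends the list
    };

    // First supported entry wins; the sentinel bounds the walk instead of a count.
    const Descriptor *find_impl(int n_cols)
    {
        for (const Descriptor *d = impl_list; d->method != Method::DEFAULT; ++d)
            if (!d->is_supported || d->is_supported(n_cols))
                return d;
        return nullptr;
    }
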
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index 18d8fc9312..aa6ecc2919 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,7 @@ const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, in
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template bool has_opt_gemm<int16_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index b0a01886d2..fd20e53f60 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -186,6 +186,7 @@ const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int3
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template bool has_opt_gemm<int8_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
deleted file mode 100644
index b71f390ab9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_gemm.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-namespace arm_gemm {
-
-template<typename strategy, typename To, typename Tr>
-class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- /* const properties set by constructor */
- const CPUInfo * const _ci;
-
- const unsigned int _Msize;
- const unsigned int _Nsize;
- const unsigned int _Ksize;
-
- const unsigned int _nbatches;
- const unsigned int _nmulti;
-
- const Activation _act;
-
- const int _maxthreads;
- int _nthreads;
-
- /* Blocking info */
- unsigned int _k_block=0;
- unsigned int _x_block=0;
-
- unsigned int _Mround_div=0;
- unsigned int _Mround=0;
- unsigned int _Nround_div=0;
- unsigned int _Nround=0;
-
- /* Working space, pretransposed buffer */
- const Toi *_B_transposed=nullptr;
- void *_working_space=nullptr;
-
- /* We will need to walk through the blocks of B in a few contexts, so
- * factor that out. */
- class blockwalker {
- private:
- /* Size loops, etc. based on our parent's configuration */
- const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;
-
- /* K, X and multi parameters for current iteration. */
- unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
-
- unsigned int _index=0;
- bool _done=false;
- bool _newkblock=true;
- bool _newmulti=true;
-
- public:
- blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
- : _parent(parent)
- , _xmax { parent._Nsize }
- { }
-
- blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
- : _parent(parent)
- , _x0 { x0 }
- , _xmin { x0 }
- , _xmax { xmax }
- {
- assert(_x0 <= _xmax);
- }
-
- unsigned int xmax() {
- return std::min(_x0 + _parent._x_block, _xmax);
- }
-
- unsigned int kmax() {
- return std::min(_k0 + _parent._k_block, _parent._Ksize);
- }
-
- /* Advance to the next block, return false at the end. */
- bool advance(void) {
- if (_done) {
- return false;
- }
-
- _newkblock=false;
- _x0 += _parent._x_block;
- if (_x0 >= _xmax) {
- _x0=_xmin;
- _k0 += _parent._k_block;
- if (_k0 >= _parent._Ksize) {
- _k0=0;
- _multi++;
- if (_multi >= _parent._nmulti) {
- _done=true;
- return false;
- }
- _newmulti=true;
- }
- _newkblock=true;
- }
- _index++;
-
- return true;
- }
-
- unsigned int k0(void) { return _k0; }
- unsigned int x0(void) { return _x0; }
- unsigned int multi(void) { return _multi; }
- unsigned int index(void) { return _index; }
- bool done(void) { return _done; }
- bool newkblock(void) { return _newkblock; }
- };
-
- // A working size: one of these is needed regardless of thread count, divided up according to the window.
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
- }
-
- // As B will be pretransposed we do not need to allocate any space for it.
- size_t get_b_working_size() const {
- return 0;
- }
-
- // C working size: One needed per thread.
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
- }
-
- // Internal execute function.
- // This implements the "pretransposed" interface only.
- void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
- /* Make sure we've been set up correctly. */
- assert(_B_transposed);
- assert(_working_space);
- assert(this->_Aptr);
- assert(this->_Cptr);
-
-#ifdef CYCLE_PROFILING
- profiler prof;
-#endif
- strategy strat(_ci);
-
- /* Translate 'start' and 'end' into a position within the batches and rows. */
- const unsigned int window_per_batch = _Mround / strategy::out_height();
- unsigned int batch_0 = m_start / window_per_batch;
- unsigned int batch_end = m_end / window_per_batch;
-
- /* Compute the M values to operate on */
- unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
- unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
-
- unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start);
- unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
-
- blockwalker current(*this, n_0, n_max);
-
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
-
- auto c_panel_start = working_space_bytes;
- auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
-
- auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
- auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);
-
- /* B^t is stored in interleaved panels separated by their K-block component.
- * We keep a pointer to the start of the current k-page; when we come to the
- * next k-block we simply add the size of the previous page to this base
- * pointer.
- */
- const Toi *b_panel_start = _B_transposed;
- // b_panel stores a pointer to the start of our current block inside the k-block.
- const Toi *b_panel = b_panel_start;
-
- // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
- unsigned b_page_size = 0;
- int kern_k = 0;
- for (;!current.done();current.advance()) {
- int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
-
- if (current.newkblock()) {
- kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
- kern_k *= strat.k_unroll();
-
- unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());
-
- b_panel_start += b_page_size;
- b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
- b_page_size = _Nround * kern_k;
-
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
-
- if (first_m >= last_m)
- continue;
-
- auto a_thread_panel_in = this->_Aptr
- + (batch * this->_A_batch_stride)
- + (current.multi() * this->_A_multi_stride);
-
- auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
-
- strat.transforms.PrepareA(
- a_thread_panel_out,
- a_thread_panel_in,
- this->_lda,
- first_m,
- last_m,
- current.k0(),
- current.kmax(),
- 0);
- }
- }
-
- /* Do the actual work. */
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
-
- const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
-
- if (first_m >= last_m)
- continue;
-
- for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
- unsigned int ymax = std::min(_Msize, y + strategy::out_height());
-
- strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
- a_ptr += (strategy::out_height() * kern_k);
-
- /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
- const bool first_pass = current.k0()==0;
- const bool last_pass = current.kmax()==_Ksize;
-
- auto c_panel_out = this->_Cptr
- + this->_C_batch_stride * batch
- + this->_C_multi_stride * current.multi();
-
- auto bias = (first_pass && this->_bias)
- ? this->_bias + (current.multi() * this->_bias_multi_stride)
- : nullptr;
-
- auto act = last_pass ? _act : Activation();
-
- strat.transforms.Merge(
- c_panel_out,
- c_panel,
- this->_ldc,
- y,
- ymax,
- current.x0(),
- current.xmax(),
- bias,
- act,
- !first_pass); //Append
- }
- }
-
- b_panel += (bblocks * strat.out_width() * kern_k);
- }
- }
-
- static unsigned int get_k_block_size(const GemmArgs &args) {
- // Work out blocking parameters, or override from provided GemmConfig
- if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
- }
-
- const unsigned int L1_size = args._ci->get_L1_cache_size();
- unsigned int k_block;
-
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- k_block /= strategy::k_unroll();
- k_block = std::max(k_block, 1U) * strategy::k_unroll();
-
- // Now tune to the presented problem size; this is how many blocks we need.
- unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
-
- // So divide the space equally into that many blocks.
- k_block = iceildiv(args._Ksize, numk_blocks);
-
- // And round UP to the K unroll level required.
- k_block = iceildiv(k_block, strategy::k_unroll());
- k_block *= strategy::k_unroll();
-
- return k_block;
- }
-
-public:
- GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
- GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
-
- /* Constructor */
- GemmInterleavedPretransposed2d(const GemmArgs &args)
- : _ci(args._ci)
- , _Msize(args._Msize)
- , _Nsize(args._Nsize)
- , _Ksize(args._Ksize)
- , _nbatches(args._nbatches)
- , _nmulti(args._nmulti)
- , _act(args._act)
- , _maxthreads(args._maxthreads)
- , _nthreads(args._maxthreads)
- , _k_block(get_k_block_size(args))
- // Work out the rounded size of M - needed for some buffers.
- , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
- , _Mround ( _Mround_div * strategy::out_height() )
-
- , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
- , _Nround ( _Nround_div * strategy::out_width() )
- {
- assert(_maxthreads > 0);
-
- const unsigned int L2_size = _ci->get_L2_cache_size();
-
- if (args._cfg && args._cfg->outer_block_size) {
- _x_block = args._cfg->outer_block_size;
- } else {
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
-
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
- // And tune to the presented problem size.
- unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
-
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
- }
- }
-
- // Interface implementation - Compulsory functions
- ndrange_t get_window_size() const override {
- unsigned m = (_Mround / strategy::out_height()) * _nbatches;
- unsigned n = _Nround_div;
-
- return { m, n };
- }
-
- bool supports_dynamic_scheduling() const override {
- return true;
- }
-
- // set_nthreads: pass on to the buffer manager to avoid it waiting for non-existent threads.
- void set_nthreads(int nthreads) override {
- _nthreads = std::min(nthreads, _maxthreads);
- }
-
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- /* This particular GEMM implementation can only be broken up over the M and N
- * dimensions; we inform the framework of this limitation via the
- * get_window_size function.
- */
- const auto m_start = work_range.get_position(0);
- const auto n_start = work_range.get_position(1);
- const auto m_size = work_range.get_size(0);
- const auto n_size = work_range.get_size(1);
- const auto m_end = m_start + m_size;
- const auto n_end = n_start + n_size;
-
- const auto m_threadid = thread_locator.get_position(0);
- const auto n_threadid = thread_locator.get_position(1);
-
- execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
- }
-
- std::size_t get_working_size() const override {
- /* Because we do not know how the scheduler will break up
- * the task, we need to ensure that we allocate enough
- * space to handle both the case where every thread
- * is parallelised across B AND the case where every thread is
- * parallelised across A.
- *
- * If we parallelise across A, then we only need one buffer of A and 64 buffers of B;
- * if we parallelise across B, then we only need 64 buffers of B and one buffer of A.
- */
- return get_c_working_size() * _maxthreads
- + get_a_working_size() * _maxthreads
- + 64; //to account for cacheline alignment
- }
-
-
- void set_working_space(void *working_space) override {
- // Make sure everything ends up cache line aligned
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
- intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
-
- size_t diff=0;
-
- if (working_space_int & 0x3F) {
- diff = 0x40 - (working_space_int & 0x3F);
- }
-
- working_space_bytes += diff;
-
- _working_space = reinterpret_cast<void *>(working_space_bytes);
- }
-
- // Interface implementation - pretransposed
- bool B_is_pretransposed() const override {
- return true;
- }
-
- bool B_pretranspose_required() const override {
- return _B_transposed==nullptr;
- }
-
- // TODO: this could almost certainly be considerably simpler.
- size_t get_B_pretransposed_array_size() const override {
- size_t total=0;
- blockwalker current(*this);
-
- do {
- /* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
- unsigned int k_size = (current.kmax() - current.k0());
-
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
-
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
-
- total += x_size * k_size * sizeof(Toi);
- } while (current.advance());
-
- return total;
- }
-
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
- blockwalker current(*this);
- Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
- _B_transposed = buffer;
- strategy strat(_ci);
-
- do {
- /* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
- unsigned int k_size = (current.kmax() - current.k0());
-
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
-
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
-
- strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- current.x0(), current.xmax(), current.k0(), current.kmax());
-
- buffer += (x_size * k_size);
- } while (current.advance());
- }
-
- void set_pretransposed_B_data(void *in_buffer) override {
- _B_transposed = reinterpret_cast<Toi *>(in_buffer);
- }
-
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
- unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
- unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
- unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
-
- uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
- uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
- uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
-
- // Wide problems incur extra preparation cost, as it is done per thread.
- // Duplicate the logic the scheduler will later use to figure out how much that will affect us
- float ratio = m_blocks / static_cast<float>(n_blocks);
-
- unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
- unsigned int height = 1;
-
- if (ideal_height == 0) {
- height = 1;
- } else {
- for (unsigned int adj=0; adj<ideal_height; adj++) {
- const unsigned int round_down = ideal_height - adj;
- if (args._maxthreads % round_down == 0) {
- height = round_down;
- break;
- }
-
- const unsigned int round_up = ideal_height + adj;
- if (args._maxthreads % round_up == 0) {
- height = round_up;
- break;
- }
- }
- }
-
- // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
- prepare_bytes *= (args._maxthreads / height);
-
- float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
- float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
- float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
-
- float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
-
- // We can't thread over multis, which might be a problem in some
- // threaded cases. Penalize that here.
- float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
-
- if (parallelism_available < args._maxthreads) {
- total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
- }
-
- return static_cast<uint64_t>(total_cycles);
- }
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 9e8907d60f..c725815859 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -80,7 +80,7 @@ static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_s8q_mopa_1VLx4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
[](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL, int8_t, int8_t>(args, qp); }
@@ -88,7 +88,7 @@ static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_s8q_mopa_4VLx1VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
[](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL, int8_t, int8_t>(args, qp); }
@@ -96,7 +96,7 @@ static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_s8q_mopa_2VLx2VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL, int8_t, int8_t>(args, qp); }
},
@@ -265,6 +265,7 @@ const GemmImplementation<int8_t, int8_t, Requantize32> *gemm_implementation_list
template UniqueGemmCommon<int8_t, int8_t> gemm<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template bool has_opt_gemm<int8_t, int8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index f93f56b57d..6254ec668d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -76,7 +76,7 @@ static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_meth
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_u8q_mopa_1VLx4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
[](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<uint32_t>();
return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL, uint8_t, uint8_t>(args, qp); }
@@ -84,7 +84,7 @@ static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_meth
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_u8q_mopa_4VLx1VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
[](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL, uint8_t, uint8_t>(args, qp); }
@@ -92,7 +92,7 @@ static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_meth
{
GemmMethod::GEMM_INTERLEAVED,
"sme2_interleaved_nomerge_u8q_mopa_2VLx2VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL, uint8_t, uint8_t>(args, qp); }
},
@@ -233,6 +233,7 @@ const GemmImplementation<uint8_t, uint8_t, Requantize32> *gemm_implementation_li
template UniqueGemmCommon<uint8_t, uint8_t> gemm<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template bool has_opt_gemm<uint8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index fc836f9790..25b6cf0cf2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,7 @@ const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t,
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template bool has_opt_gemm<uint16_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index fcc95eb503..af5cfbbf2b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -150,6 +150,7 @@ const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, u
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template bool has_opt_gemm<uint8_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t, Nothing> (const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
index 4dfe46446e..e4bfc0f6e4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -170,7 +170,6 @@ void interleave_block<4, 16, VLType::None, false>(
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
index 56ca49a36e..23800edf20 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -210,8 +210,8 @@ void interleave_block<4, 16, VLType::None, true>(
"sadalp v22.4s, v26.8h\n"
"sadalp v21.4s, v25.8h\n"
"addp v24.4s, v24.4s, v23.4s\n"
- "addp v23.4s, v22.4s, v21.4s\n"
- "addp v24.4s, v24.4s, v23.4s\n"
+ "addp v16.4s, v22.4s, v21.4s\n"
+ "addp v24.4s, v24.4s, v16.4s\n"
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
index 4c7bb71fb2..15545c24db 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -210,8 +210,8 @@ void interleave_block<4, 16, VLType::None, true>(
"uadalp v22.4s, v26.8h\n"
"uadalp v21.4s, v25.8h\n"
"addp v24.4s, v24.4s, v23.4s\n"
- "addp v23.4s, v22.4s, v21.4s\n"
- "addp v24.4s, v24.4s, v23.4s\n"
+ "addp v16.4s, v22.4s, v21.4s\n"
+ "addp v24.4s, v24.4s, v16.4s\n"
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
index 2ba2aa854a..b900c330b7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -80,36 +80,36 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d28, [x28], #0x8\n"
- "ldr d27, [x27], #0x8\n"
- "shll v28.4s, v28.4h, #0x10\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
"shll v27.4s, v27.4h, #0x10\n"
+ "shll v26.4s, v26.4h, #0x10\n"
"ldr d22, [x26], #0x8\n"
"ldr d21, [x25], #0x8\n"
"shll v22.4s, v22.4h, #0x10\n"
"shll v21.4s, v21.4h, #0x10\n"
- "ldr d26, [x24], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "shll v26.4s, v26.4h, #0x10\n"
- "shll v25.4s, v25.4h, #0x10\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
"shll v20.4s, v20.4h, #0x10\n"
+ "shll v25.4s, v25.4h, #0x10\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
"shll v19.4s, v19.4h, #0x10\n"
- "zip1 v24.4s, v28.4s, v22.4s\n"
- "zip1 v23.4s, v27.4s, v21.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "zip1 v24.4s, v27.4s, v22.4s\n"
+ "zip1 v23.4s, v26.4s, v21.4s\n"
"subs %x[width], %x[width], #0x4\n"
"cmp %x[width], #0x4\n"
- "zip1 v18.4s, v26.4s, v20.4s\n"
- "zip1 v17.4s, v25.4s, v19.4s\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v21.4s, v26.4s, v21.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v20.4s, v26.4s, v20.4s\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v19.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"prfm pldl1keep, [x22, #0x70]\n"
@@ -138,71 +138,70 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr s28, [x28], #0x4\n"
"ldr s27, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s26, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 5f\n"
"ld1 { v28.h }[2], [x28]\n"
"ld1 { v27.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v22.h }[2], [x26]\n"
- "ld1 { v21.h }[2], [x25]\n"
- "ld1 { v26.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
"ldr h28, [x28, #0x0]\n"
"ldr h27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h22, [x26, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
- "ldr h26, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"5:" // Odd load end
"shll v28.4s, v28.4h, #0x10\n"
"shll v27.4s, v27.4h, #0x10\n"
"subs x20, x20, #0x1\n"
- "shll v22.4s, v22.4h, #0x10\n"
- "shll v21.4s, v21.4h, #0x10\n"
"shll v26.4s, v26.4h, #0x10\n"
"shll v25.4s, v25.4h, #0x10\n"
- "shll v20.4s, v20.4h, #0x10\n"
- "shll v19.4s, v19.4h, #0x10\n"
- "zip1 v24.4s, v28.4s, v22.4s\n"
- "zip1 v23.4s, v27.4s, v21.4s\n"
- "zip1 v18.4s, v26.4s, v20.4s\n"
- "zip1 v17.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v24.4s, v23.4s\n"
+ "shll v24.4s, v24.4h, #0x10\n"
+ "shll v23.4s, v23.4h, #0x10\n"
+ "shll v22.4s, v22.4h, #0x10\n"
+ "shll v21.4s, v21.4h, #0x10\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
"zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.4s, v24.4s, v23.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v18.4s, v17.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "zip2 v20.4s, v26.4s, v20.4s\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v22.4s, v21.4s\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v20.4s, v19.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
index f55c2be4a4..e54b3b9f41 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -80,33 +80,33 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q25, [x28], #0x10\n"
- "ldr q30, [x27], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
- "ldr q29, [x26], #0x10\n"
- "ldr q28, [x25], #0x10\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
"ldr q21, [x24], #0x10\n"
- "ldr q27, [x23], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
"zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v26.8h, v30.8h, v27.8h\n"
- "ldr q20, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v19.8h, v29.8h, v20.8h\n"
- "zip1 v18.8h, v28.8h, v22.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v22.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
"zip1 v24.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v26.8h, v18.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v19.8h, v26.8h, v18.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
"zip1 v22.8h, v25.8h, v21.8h\n"
@@ -134,132 +134,131 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v25.s }[2], [x28], #0x4\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
"mov x20, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[6], [x28]\n"
- "ld1 { v30.h }[6], [x27]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
- "ld1 { v22.h }[6], [x21]\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[4], [x28]\n"
- "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.h }[4], [x26]\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
- "ld1 { v22.h }[4], [x21]\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s25, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[2], [x28]\n"
- "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v22.h }[2], [x21]\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h25, [x28, #0x0]\n"
- "ldr h30, [x27, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h22, [x21, #0x0]\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v19.8h, v29.8h, v20.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v26.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v22.8h\n"
- "zip1 v24.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v24.8h, v17.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v24.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v19.8h, v26.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v22.8h\n"
- "zip1 v22.8h, v25.8h, v21.8h\n"
- "zip1 v18.8h, v20.8h, v16.8h\n"
- "zip1 v19.8h, v22.8h, v18.8h\n"
- "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v18.8h, v22.8h, v18.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v25.8h, v21.8h\n"
- "zip2 v20.8h, v20.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
index f64db0b476..3a5dcf4a6b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -79,36 +79,36 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d28, [x28], #0x8\n"
- "ldr d27, [x27], #0x8\n"
- "fcvtl v28.4s, v28.4h\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
"fcvtl v27.4s, v27.4h\n"
+ "fcvtl v26.4s, v26.4h\n"
"ldr d22, [x26], #0x8\n"
"ldr d21, [x25], #0x8\n"
"fcvtl v22.4s, v22.4h\n"
"fcvtl v21.4s, v21.4h\n"
- "ldr d26, [x24], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "fcvtl v26.4s, v26.4h\n"
- "fcvtl v25.4s, v25.4h\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
"fcvtl v20.4s, v20.4h\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
"fcvtl v19.4s, v19.4h\n"
- "zip1 v24.4s, v28.4s, v22.4s\n"
- "zip1 v23.4s, v27.4s, v21.4s\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "zip1 v24.4s, v27.4s, v22.4s\n"
+ "zip1 v23.4s, v26.4s, v21.4s\n"
"subs %x[width], %x[width], #0x4\n"
"cmp %x[width], #0x4\n"
- "zip1 v18.4s, v26.4s, v20.4s\n"
- "zip1 v17.4s, v25.4s, v19.4s\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v21.4s, v26.4s, v21.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v20.4s, v26.4s, v20.4s\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v19.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"prfm pldl1keep, [x22, #0x70]\n"
@@ -137,71 +137,70 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr s28, [x28], #0x4\n"
"ldr s27, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s26, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 5f\n"
"ld1 { v28.h }[2], [x28]\n"
"ld1 { v27.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v22.h }[2], [x26]\n"
- "ld1 { v21.h }[2], [x25]\n"
- "ld1 { v26.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
"ldr h28, [x28, #0x0]\n"
"ldr h27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h22, [x26, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
- "ldr h26, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"5:" // Odd load end
"fcvtl v28.4s, v28.4h\n"
"fcvtl v27.4s, v27.4h\n"
"subs x20, x20, #0x1\n"
- "fcvtl v22.4s, v22.4h\n"
- "fcvtl v21.4s, v21.4h\n"
"fcvtl v26.4s, v26.4h\n"
"fcvtl v25.4s, v25.4h\n"
- "fcvtl v20.4s, v20.4h\n"
- "fcvtl v19.4s, v19.4h\n"
- "zip1 v24.4s, v28.4s, v22.4s\n"
- "zip1 v23.4s, v27.4s, v21.4s\n"
- "zip1 v18.4s, v26.4s, v20.4s\n"
- "zip1 v17.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v24.4s, v23.4s\n"
+ "fcvtl v24.4s, v24.4h\n"
+ "fcvtl v23.4s, v23.4h\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
"zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.4s, v24.4s, v23.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v18.4s, v17.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "zip2 v20.4s, v26.4s, v20.4s\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v22.4s, v21.4s\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v20.4s, v19.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
index 6c009b34b8..80c387db47 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -79,29 +79,29 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x28], #0x10\n"
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
"cmp %x[width], #0x4\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
- "ldr q24, [x24], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
"ldr q23, [x23], #0x10\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@ void interleave_block<8, 1, VLType::None, false>(
"str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x20]\n"
"zip2 v16.4s, v20.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x30]\n"
@@ -129,63 +129,62 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr d28, [x28], #0x8\n"
"ldr d27, [x27], #0x8\n"
"mov x20, #0x2\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
"ldr d24, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
"ld1 { v28.s }[2], [x28]\n"
"ld1 { v27.s }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v22.s }[2], [x26]\n"
- "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x25]\n"
"ld1 { v24.s }[2], [x24]\n"
"ld1 { v23.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v18.s }[2], [x21]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
"ldr s28, [x28, #0x0]\n"
"ldr s27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr s22, [x26, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s25, [x25, #0x0]\n"
"ldr s24, [x24, #0x0]\n"
"ldr s23, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s18, [x21, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
"5:" // Odd load end
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v20.4s, v17.4s\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v22.4s, v21.4s\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
index 767d468ad1..8e06b7ecab 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -80,33 +80,33 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q25, [x28], #0x10\n"
- "ldr q30, [x27], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
- "ldr q29, [x26], #0x10\n"
- "ldr q28, [x25], #0x10\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
"ldr q21, [x24], #0x10\n"
- "ldr q27, [x23], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
"zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v26.8h, v30.8h, v27.8h\n"
- "ldr q20, [x22], #0x10\n"
- "ldr q22, [x21], #0x10\n"
- "zip1 v19.8h, v29.8h, v20.8h\n"
- "zip1 v18.8h, v28.8h, v22.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v22.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
"zip1 v24.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v26.8h, v18.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v19.8h, v26.8h, v18.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
"zip1 v22.8h, v25.8h, v21.8h\n"
@@ -134,132 +134,131 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d25, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v25.s }[2], [x28], #0x4\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
"mov x20, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v22.s }[2], [x21], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[6], [x28]\n"
- "ld1 { v30.h }[6], [x27]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
- "ld1 { v22.h }[6], [x21]\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[4], [x28]\n"
- "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.h }[4], [x26]\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
- "ld1 { v22.h }[4], [x21]\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s25, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.h }[2], [x28]\n"
- "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v22.h }[2], [x21]\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h25, [x28, #0x0]\n"
- "ldr h30, [x27, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h22, [x21, #0x0]\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v19.8h, v29.8h, v20.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v26.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v22.8h\n"
- "zip1 v24.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v24.8h, v17.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v24.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v23.8h, v19.8h\n"
- "zip2 v19.8h, v26.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v22.8h\n"
- "zip1 v22.8h, v25.8h, v21.8h\n"
- "zip1 v18.8h, v20.8h, v16.8h\n"
- "zip1 v19.8h, v22.8h, v18.8h\n"
- "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v18.8h, v22.8h, v18.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v25.8h, v21.8h\n"
- "zip2 v20.8h, v20.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
index a73792036a..b91ae8a948 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -159,101 +159,101 @@ void interleave_block<8, 1, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr d31, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v31.s }[2], [x28], #0x4\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
"mov x20, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
"ld1 { v24.s }[2], [x22], #0x4\n"
"ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[6], [x28]\n"
- "ld1 { v30.h }[6], [x27]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
+ "ld1 { v25.h }[6], [x23]\n"
"ld1 { v24.h }[6], [x22]\n"
"ld1 { v23.h }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[4], [x28]\n"
- "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.h }[4], [x26]\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
+ "ld1 { v25.h }[4], [x23]\n"
"ld1 { v24.h }[4], [x22]\n"
"ld1 { v23.h }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr s31, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
"ldr s24, [x22], #0x4\n"
"ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[2], [x28]\n"
- "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
"ld1 { v24.h }[2], [x22]\n"
"ld1 { v23.h }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr h31, [x28, #0x0]\n"
- "ldr h30, [x27, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
"ldr h24, [x22, #0x0]\n"
"ldr h23, [x21, #0x0]\n"
"9:" // Odd load end
- "zip1 v25.8h, v31.8h, v27.8h\n"
- "zip1 v18.8h, v29.8h, v24.8h\n"
- "subs x20, x20, #0x1\n"
"zip1 v22.8h, v30.8h, v26.8h\n"
- "zip1 v21.8h, v28.8h, v23.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v21.8h\n"
- "zip1 v20.8h, v17.8h, v16.8h\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v20.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"subs x20, x20, #0x1\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v19.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v25.8h, v18.8h\n"
- "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -266,11 +266,11 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v22.8h, v31.8h, v27.8h\n"
- "zip2 v21.8h, v29.8h, v24.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v26.8h\n"
- "zip2 v19.8h, v28.8h, v23.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
"zip1 v18.8h, v22.8h, v21.8h\n"
"zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
@@ -284,9 +284,9 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v22.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
index 4a38187638..c41120c698 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -80,35 +80,35 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr d25, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
"sshll v25.8h, v25.8b, #0x0\n"
- "sshll v30.8h, v30.8b, #0x0\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "sshll v29.8h, v29.8b, #0x0\n"
- "sshll v28.8h, v28.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
"ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"sshll v21.8h, v21.8b, #0x0\n"
- "sshll v27.8h, v27.8b, #0x0\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
"sshll v20.8h, v20.8b, #0x0\n"
- "sshll v26.8h, v26.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
"zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v22.8h, v29.8h, v20.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
- "zip1 v19.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v26.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"zip1 v24.8h, v23.8h, v22.8h\n"
@@ -142,140 +142,139 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s25, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v25.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
"mov x20, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v20.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v20.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v20.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h25, [x28], #0x2\n"
- "ldr h30, [x27], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
"mov x20, #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
- "ldr h27, [x23], #0x2\n"
- "ldr h20, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v20.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b25, [x28, #0x0]\n"
- "ldr b30, [x27, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
- "ldr b27, [x23, #0x0]\n"
- "ldr b20, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"7:" // Odd load end
- "sshll v25.8h, v25.8b, #0x0\n"
"sshll v30.8h, v30.8b, #0x0\n"
- "subs x20, x20, #0x1\n"
"sshll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"sshll v28.8h, v28.8b, #0x0\n"
- "sshll v21.8h, v21.8b, #0x0\n"
"sshll v27.8h, v27.8b, #0x0\n"
- "sshll v20.8h, v20.8b, #0x0\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v22.8h, v29.8h, v20.8h\n"
- "zip1 v19.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v17.8h, v19.8h, v18.8h\n"
- "zip1 v16.8h, v24.8h, v17.8h\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v24.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v26.8h\n"
- "zip1 v22.8h, v25.8h, v21.8h\n"
- "zip1 v18.8h, v20.8h, v16.8h\n"
- "zip1 v19.8h, v22.8h, v18.8h\n"
- "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v18.8h, v22.8h, v18.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v25.8h, v21.8h\n"
- "zip2 v20.8h, v20.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
index 3ad103c8d4..9ac7053ad8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -167,109 +167,109 @@ void interleave_block<8, 1, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr s31, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
"mov x20, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr h31, [x28], #0x2\n"
- "ldr h30, [x27], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
"mov x20, #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h25, [x22], #0x2\n"
- "ldr h24, [x21], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr b31, [x28, #0x0]\n"
- "ldr b30, [x27, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b25, [x22, #0x0]\n"
- "ldr b24, [x21, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"9:" // Odd load end
- "sshll v31.8h, v31.8b, #0x0\n"
"sshll v30.8h, v30.8b, #0x0\n"
- "subs x20, x20, #0x1\n"
"sshll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"sshll v28.8h, v28.8b, #0x0\n"
"sshll v27.8h, v27.8b, #0x0\n"
"sshll v26.8h, v26.8b, #0x0\n"
"sshll v25.8h, v25.8b, #0x0\n"
"sshll v24.8h, v24.8b, #0x0\n"
- "zip1 v23.8h, v31.8h, v27.8h\n"
- "zip1 v22.8h, v29.8h, v25.8h\n"
- "zip1 v21.8h, v30.8h, v26.8h\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v23.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"subs x20, x20, #0x1\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v19.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v23.8h, v22.8h\n"
- "zip2 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -282,11 +282,11 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v22.8h, v31.8h, v27.8h\n"
- "zip2 v21.8h, v29.8h, v25.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v26.8h\n"
- "zip2 v19.8h, v28.8h, v24.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
"zip1 v18.8h, v22.8h, v21.8h\n"
"zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
@@ -300,9 +300,9 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v22.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
index de29d77a22..c01d980f49 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -159,101 +159,101 @@ void interleave_block<8, 1, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr d31, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v31.s }[2], [x28], #0x4\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
"mov x20, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
"ld1 { v24.s }[2], [x22], #0x4\n"
"ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[6], [x28]\n"
- "ld1 { v30.h }[6], [x27]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
- "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
+ "ld1 { v25.h }[6], [x23]\n"
"ld1 { v24.h }[6], [x22]\n"
"ld1 { v23.h }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[4], [x28]\n"
- "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.h }[4], [x26]\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
- "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
+ "ld1 { v25.h }[4], [x23]\n"
"ld1 { v24.h }[4], [x22]\n"
"ld1 { v23.h }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr s31, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
"mov x20, #0x2\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
"ldr s24, [x22], #0x4\n"
"ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.h }[2], [x28]\n"
- "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
"ld1 { v24.h }[2], [x22]\n"
"ld1 { v23.h }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr h31, [x28, #0x0]\n"
- "ldr h30, [x27, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
"ldr h24, [x22, #0x0]\n"
"ldr h23, [x21, #0x0]\n"
"9:" // Odd load end
- "zip1 v25.8h, v31.8h, v27.8h\n"
- "zip1 v18.8h, v29.8h, v24.8h\n"
- "subs x20, x20, #0x1\n"
"zip1 v22.8h, v30.8h, v26.8h\n"
- "zip1 v21.8h, v28.8h, v23.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v21.8h\n"
- "zip1 v20.8h, v17.8h, v16.8h\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v20.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"subs x20, x20, #0x1\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v19.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v25.8h, v18.8h\n"
- "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -266,11 +266,11 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v22.8h, v31.8h, v27.8h\n"
- "zip2 v21.8h, v29.8h, v24.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v26.8h\n"
- "zip2 v19.8h, v28.8h, v23.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
"zip1 v18.8h, v22.8h, v21.8h\n"
"zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
@@ -284,9 +284,9 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v22.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
index 43a3a46801..d29a995b46 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -80,35 +80,35 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr d25, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
"ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"ushll v21.8h, v21.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
"ushll v20.8h, v20.8b, #0x0\n"
- "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
"zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v22.8h, v29.8h, v20.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
- "zip1 v19.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v26.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
"zip1 v24.8h, v23.8h, v22.8h\n"
@@ -142,140 +142,139 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s25, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v25.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
"mov x20, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v20.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v20.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v20.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h25, [x28], #0x2\n"
- "ldr h30, [x27], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
"mov x20, #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
- "ldr h27, [x23], #0x2\n"
- "ldr h20, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v25.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v20.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b25, [x28, #0x0]\n"
- "ldr b30, [x27, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
- "ldr b27, [x23, #0x0]\n"
- "ldr b20, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"7:" // Odd load end
- "ushll v25.8h, v25.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "subs x20, x20, #0x1\n"
"ushll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ushll v21.8h, v21.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ushll v20.8h, v20.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v23.8h, v25.8h, v21.8h\n"
- "zip1 v22.8h, v29.8h, v20.8h\n"
- "zip1 v19.8h, v30.8h, v27.8h\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v24.8h, v23.8h, v22.8h\n"
- "zip1 v17.8h, v19.8h, v18.8h\n"
- "zip1 v16.8h, v24.8h, v17.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v24.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v23.8h, v22.8h\n"
- "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v25.8h, v21.8h\n"
- "zip2 v21.8h, v29.8h, v20.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v27.8h\n"
- "zip2 v16.8h, v28.8h, v26.8h\n"
- "zip1 v22.8h, v25.8h, v21.8h\n"
- "zip1 v18.8h, v20.8h, v16.8h\n"
- "zip1 v19.8h, v22.8h, v18.8h\n"
- "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v18.8h, v22.8h, v18.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v25.8h, v21.8h\n"
- "zip2 v20.8h, v20.8h, v16.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
index 3ab24365af..ae4bf9bf3b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -167,109 +167,109 @@ void interleave_block<8, 1, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr s31, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
"mov x20, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
"mov x20, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
- "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
"mov x20, #0x5\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
- "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr h31, [x28], #0x2\n"
- "ldr h30, [x27], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
"mov x20, #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h25, [x22], #0x2\n"
- "ldr h24, [x21], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
- "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr b31, [x28, #0x0]\n"
- "ldr b30, [x27, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b25, [x22, #0x0]\n"
- "ldr b24, [x21, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"9:" // Odd load end
- "ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "subs x20, x20, #0x1\n"
"ushll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
"ushll v25.8h, v25.8b, #0x0\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "zip1 v23.8h, v31.8h, v27.8h\n"
- "zip1 v22.8h, v29.8h, v25.8h\n"
- "zip1 v21.8h, v30.8h, v26.8h\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v23.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v20.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"subs x20, x20, #0x1\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "add v2.8h, v2.8h, v19.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v23.8h, v22.8h\n"
- "zip2 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
"subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -282,11 +282,11 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v22.8h, v31.8h, v27.8h\n"
- "zip2 v21.8h, v29.8h, v25.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.8h, v30.8h, v26.8h\n"
- "zip2 v19.8h, v28.8h, v24.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
"zip1 v18.8h, v22.8h, v21.8h\n"
"zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
@@ -300,9 +300,9 @@ void interleave_block<8, 1, VLType::None, true>(
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v22.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
index d4d150456f..43d9d20c10 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -79,29 +79,29 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x28], #0x10\n"
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
- "ldr q24, [x24], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
"ldr q23, [x23], #0x10\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@ void interleave_block<8, 2, VLType::None, false>(
"str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x20]\n"
"zip2 v16.4s, v20.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x30]\n"
@@ -128,32 +128,32 @@ void interleave_block<8, 2, VLType::None, false>(
"tbz %x[width], #2, 5f\n"
"ldr d28, [x28], #0x8\n"
"ldr d27, [x27], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
"ldr d24, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v28.s }[2], [x28], #0x4\n"
"ld1 { v27.s }[2], [x27], #0x4\n"
"mov x20, #0x3\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v24.s }[2], [x24], #0x4\n"
"ld1 { v23.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v28.h }[6], [x28]\n"
"ld1 { v27.h }[6], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v22.h }[6], [x26]\n"
- "ld1 { v21.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x26]\n"
+ "ld1 { v25.h }[6], [x25]\n"
"ld1 { v24.h }[6], [x24]\n"
"ld1 { v23.h }[6], [x23]\n"
- "ld1 { v19.h }[6], [x22]\n"
- "ld1 { v18.h }[6], [x21]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v21.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x2\n"
@@ -161,82 +161,81 @@ void interleave_block<8, 2, VLType::None, false>(
"ld1 { v28.h }[4], [x28]\n"
"ld1 { v27.h }[4], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v22.h }[4], [x26]\n"
- "ld1 { v21.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x26]\n"
+ "ld1 { v25.h }[4], [x25]\n"
"ld1 { v24.h }[4], [x24]\n"
"ld1 { v23.h }[4], [x23]\n"
- "ld1 { v19.h }[4], [x22]\n"
- "ld1 { v18.h }[4], [x21]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v21.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
"ldr s28, [x28], #0x4\n"
"ldr s27, [x27], #0x4\n"
"mov x20, #0x1\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
"ldr s24, [x24], #0x4\n"
"ldr s23, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v28.h }[2], [x28]\n"
"ld1 { v27.h }[2], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v22.h }[2], [x26]\n"
- "ld1 { v21.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
"ld1 { v24.h }[2], [x24]\n"
"ld1 { v23.h }[2], [x23]\n"
- "ld1 { v19.h }[2], [x22]\n"
- "ld1 { v18.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
"ldr h28, [x28, #0x0]\n"
"ldr h27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h22, [x26, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
"ldr h24, [x24, #0x0]\n"
"ldr h23, [x23, #0x0]\n"
- "ldr h19, [x22, #0x0]\n"
- "ldr h18, [x21, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v20.4s, v17.4s\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v22.4s, v21.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v17.4s, v22.4s, v21.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
index 358b83ad1b..3ec03370a0 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -79,18 +79,18 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q26, [x28], #0x10\n"
- "ldr q21, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
"cmp %x[width], #0x4\n"
"ldr q25, [x26], #0x10\n"
"ldr q24, [x25], #0x10\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"zip1 v18.2d, v25.2d, v24.2d\n"
"ldr q23, [x24], #0x10\n"
"ldr q22, [x23], #0x10\n"
"zip1 v17.2d, v23.2d, v22.2d\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
"ldr q20, [x22], #0x10\n"
"ldr q19, [x21], #0x10\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -118,62 +118,61 @@ void interleave_block<8, 2, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d26, [x28], #0x8\n"
- "ldr d21, [x27], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
"mov x20, #0x1\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v26.s }[2], [x28]\n"
- "ld1 { v21.s }[2], [x27]\n"
+ "ld1 { v25.s }[2], [x28]\n"
+ "ld1 { v24.s }[2], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v24.s }[2], [x25]\n"
- "ld1 { v23.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v19.s }[2], [x21]\n"
+ "ld1 { v23.s }[2], [x26]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s26, [x28, #0x0]\n"
- "ldr s21, [x27, #0x0]\n"
+ "ldr s25, [x28, #0x0]\n"
+ "ldr s24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s24, [x25, #0x0]\n"
- "ldr s23, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s19, [x21, #0x0]\n"
+ "ldr s23, [x26, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s18, [x21, #0x0]\n"
"5:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 6f\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v20.2d, v19.2d\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
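
For reference, the zip1/zip2 sequences above implement an 8-row, block-2 interleave: each 64-bit lane of a q register carries two fp32 values from one row, so the output stream is row0[c..c+1], row1[c..c+1], ..., row7[c..c+1] for each column pair c. A minimal scalar sketch of that layout follows (an illustration only, not the library's implementation; the ragged tail is padded with zeros here for determinism, whereas the kernel leaves unloaded lanes unspecified):

#include <cstddef>

// Scalar model of the 8-row, block-2 fp32 interleave performed above with
// zip1/zip2 on 64-bit lanes. 'in' holds the eight row pointers; 'out'
// receives width-rounded-up-to-2 columns per row, row-major within each
// two-column block.
void interleave8_block2_ref(float *out, const float *const in[8], size_t width)
{
    for (size_t col = 0; col < width; col += 2)   // one 2-element block per row
    {
        const size_t remaining = width - col;
        for (int row = 0; row < 8; row++)
        {
            out[0] = in[row][col];                             // low lane of the zip
            out[1] = (remaining > 1) ? in[row][col + 1] : 0.0f; // odd tail, padded
            out += 2;
        }
    }
}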
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
index d606d5a5b6..e9799f87a9 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -79,18 +79,18 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q26, [x28], #0x10\n"
- "ldr q21, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
"cmp %x[width], #0x8\n"
"ldr q25, [x26], #0x10\n"
"ldr q24, [x25], #0x10\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"zip1 v18.2d, v25.2d, v24.2d\n"
"ldr q23, [x24], #0x10\n"
"ldr q22, [x23], #0x10\n"
"zip1 v17.2d, v23.2d, v22.2d\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
"ldr q20, [x22], #0x10\n"
"ldr q19, [x21], #0x10\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -118,104 +118,103 @@ void interleave_block<8, 4, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d26, [x28], #0x8\n"
- "ldr d21, [x27], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v26.s }[2], [x28], #0x4\n"
- "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
"mov x20, #0x2\n"
- "ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v24.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v26.h }[6], [x28]\n"
- "ld1 { v21.h }[6], [x27]\n"
- "ld1 { v25.h }[6], [x26]\n"
- "ld1 { v24.h }[6], [x25]\n"
- "ld1 { v23.h }[6], [x24]\n"
- "ld1 { v22.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
- "ld1 { v19.h }[6], [x21]\n"
+ "ld1 { v25.h }[6], [x28]\n"
+ "ld1 { v24.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x26]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v19.h }[6], [x22]\n"
+ "ld1 { v18.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v26.h }[4], [x28]\n"
- "ld1 { v21.h }[4], [x27]\n"
+ "ld1 { v25.h }[4], [x28]\n"
+ "ld1 { v24.h }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[4], [x26]\n"
- "ld1 { v24.h }[4], [x25]\n"
- "ld1 { v23.h }[4], [x24]\n"
- "ld1 { v22.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
- "ld1 { v19.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x26]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v19.h }[4], [x22]\n"
+ "ld1 { v18.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s26, [x28], #0x4\n"
- "ldr s21, [x27], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
"mov x20, #0x1\n"
- "ldr s25, [x26], #0x4\n"
- "ldr s24, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v26.h }[2], [x28]\n"
- "ld1 { v21.h }[2], [x27]\n"
- "ld1 { v25.h }[2], [x26]\n"
- "ld1 { v24.h }[2], [x25]\n"
- "ld1 { v23.h }[2], [x24]\n"
- "ld1 { v22.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v25.h }[2], [x28]\n"
+ "ld1 { v24.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x26]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v19.h }[2], [x22]\n"
+ "ld1 { v18.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h26, [x28, #0x0]\n"
- "ldr h21, [x27, #0x0]\n"
+ "ldr h25, [x28, #0x0]\n"
+ "ldr h24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr h25, [x26, #0x0]\n"
- "ldr h24, [x25, #0x0]\n"
- "ldr h23, [x24, #0x0]\n"
- "ldr h22, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
+ "ldr h23, [x26, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h21, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h19, [x22, #0x0]\n"
+ "ldr h18, [x21, #0x0]\n"
"7:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 8f\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v20.2d, v19.2d\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
index dfec14358b..730bfd6342 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -79,14 +79,14 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q23, [x28], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
- ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
- "ldr q21, [x24], #0x10\n"
- "ldr q20, [x22], #0x10\n"
- ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
- ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ ".inst 0x0ea16a37 // bfcvtn v23.4h, v17.4s\n"
+ ".inst 0x0ea16a16 // bfcvtn v22.4h, v16.4s\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ ".inst 0x0ea16a35 // bfcvtn v21.4h, v17.4s\n"
+ ".inst 0x0ea16a14 // bfcvtn v20.4h, v16.4s\n"
"ldr q19, [x27], #0x10\n"
"ldr q18, [x25], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
@@ -114,51 +114,50 @@ void interleave_block<8, 4, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d23, [x28], #0x8\n"
- "ldr d19, [x27], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
+ "ldr d23, [x27], #0x8\n"
"mov x20, #0x1\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v23.s }[2], [x28]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x26]\n"
- "ld1 { v18.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v16.s }[2], [x21]\n"
+ "ld1 { v19.s }[2], [x28]\n"
+ "ld1 { v23.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x26]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s23, [x28, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
+ "ldr s19, [x28, #0x0]\n"
+ "ldr s23, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr s22, [x26, #0x0]\n"
- "ldr s18, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s16, [x21, #0x0]\n"
+ "ldr s18, [x26, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
"5:" // Odd load end
- ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
- ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
- ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
- ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
- ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n"
- ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n"
- ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16af3 // bfcvtn2 v19.8h, v23.4s\n"
+ ".inst 0x4ea16ad2 // bfcvtn2 v18.8h, v22.4s\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ ".inst 0x4ea16ab1 // bfcvtn2 v17.8h, v21.4s\n"
+ ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
index 54f15f8a5c..15d8ddbe53 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -79,29 +79,29 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x28], #0x10\n"
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q22, [x26], #0x10\n"
- "ldr q21, [x25], #0x10\n"
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
- "ldr q24, [x24], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
"ldr q23, [x23], #0x10\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
- "ldr q19, [x22], #0x10\n"
- "ldr q18, [x21], #0x10\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
"prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
"prfm pldl1keep, [x26, #0x70]\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@ void interleave_block<8, 4, VLType::None, false>(
"str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
"str q16, [%x[out_ptr], #0x20]\n"
"zip2 v16.4s, v20.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x30]\n"
@@ -128,40 +128,40 @@ void interleave_block<8, 4, VLType::None, false>(
"tbz %x[width], #3, 7f\n"
"ldr d28, [x28], #0x8\n"
"ldr d27, [x27], #0x8\n"
- "ldr d22, [x26], #0x8\n"
- "ldr d21, [x25], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
"ldr d24, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
"ld1 { v28.s }[2], [x28], #0x4\n"
"ld1 { v27.s }[2], [x27], #0x4\n"
- "ld1 { v22.s }[2], [x26], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v24.s }[2], [x24], #0x4\n"
"ld1 { v23.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v28.h }[6], [x28], #0x2\n"
"ld1 { v27.h }[6], [x27], #0x2\n"
"mov x20, #0x4\n"
- "ld1 { v22.h }[6], [x26], #0x2\n"
- "ld1 { v21.h }[6], [x25], #0x2\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v24.h }[6], [x24], #0x2\n"
"ld1 { v23.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v21.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[14], [x28]\n"
"ld1 { v27.b }[14], [x27]\n"
- "ld1 { v22.b }[14], [x26]\n"
- "ld1 { v21.b }[14], [x25]\n"
+ "ld1 { v26.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
"ld1 { v24.b }[14], [x24]\n"
"ld1 { v23.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v22.b }[14], [x22]\n"
+ "ld1 { v21.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"mov x20, #0x3\n"
@@ -169,33 +169,33 @@ void interleave_block<8, 4, VLType::None, false>(
"ld1 { v28.b }[12], [x28]\n"
"ld1 { v27.b }[12], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v22.b }[12], [x26]\n"
- "ld1 { v21.b }[12], [x25]\n"
+ "ld1 { v26.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
"ld1 { v24.b }[12], [x24]\n"
"ld1 { v23.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v22.b }[12], [x22]\n"
+ "ld1 { v21.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
"ld1 { v28.h }[4], [x28], #0x2\n"
"ld1 { v27.h }[4], [x27], #0x2\n"
"mov x20, #0x3\n"
- "ld1 { v22.h }[4], [x26], #0x2\n"
- "ld1 { v21.h }[4], [x25], #0x2\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v24.h }[4], [x24], #0x2\n"
"ld1 { v23.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v21.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[10], [x28]\n"
"ld1 { v27.b }[10], [x27]\n"
- "ld1 { v22.b }[10], [x26]\n"
- "ld1 { v21.b }[10], [x25]\n"
+ "ld1 { v26.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
"ld1 { v24.b }[10], [x24]\n"
"ld1 { v23.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v22.b }[10], [x22]\n"
+ "ld1 { v21.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"mov x20, #0x2\n"
@@ -203,42 +203,42 @@ void interleave_block<8, 4, VLType::None, false>(
"ld1 { v28.b }[8], [x28]\n"
"ld1 { v27.b }[8], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v22.b }[8], [x26]\n"
- "ld1 { v21.b }[8], [x25]\n"
+ "ld1 { v26.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
"ld1 { v24.b }[8], [x24]\n"
"ld1 { v23.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v22.b }[8], [x22]\n"
+ "ld1 { v21.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
"ldr s28, [x28], #0x4\n"
"ldr s27, [x27], #0x4\n"
- "ldr s22, [x26], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
"ldr s24, [x24], #0x4\n"
"ldr s23, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
"ld1 { v28.h }[2], [x28], #0x2\n"
"ld1 { v27.h }[2], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v22.h }[2], [x26], #0x2\n"
- "ld1 { v21.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v24.h }[2], [x24], #0x2\n"
"ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v21.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[6], [x28]\n"
"ld1 { v27.b }[6], [x27]\n"
- "ld1 { v22.b }[6], [x26]\n"
- "ld1 { v21.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"ld1 { v24.b }[6], [x24]\n"
"ld1 { v23.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x22]\n"
+ "ld1 { v21.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"mov x20, #0x1\n"
@@ -246,81 +246,80 @@ void interleave_block<8, 4, VLType::None, false>(
"ld1 { v28.b }[4], [x28]\n"
"ld1 { v27.b }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v22.b }[4], [x26]\n"
- "ld1 { v21.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
"ld1 { v24.b }[4], [x24]\n"
"ld1 { v23.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x22]\n"
+ "ld1 { v21.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
"ldr h28, [x28], #0x2\n"
"ldr h27, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h22, [x26], #0x2\n"
- "ldr h21, [x25], #0x2\n"
+ "ldr h26, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
"ldr h24, [x24], #0x2\n"
"ldr h23, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
+ "ldr h21, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[2], [x28]\n"
"ld1 { v27.b }[2], [x27]\n"
- "ld1 { v22.b }[2], [x26]\n"
- "ld1 { v21.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"ld1 { v24.b }[2], [x24]\n"
"ld1 { v23.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x22]\n"
+ "ld1 { v21.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
"ldr b28, [x28, #0x0]\n"
"ldr b27, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b22, [x26, #0x0]\n"
- "ldr b21, [x25, #0x0]\n"
+ "ldr b26, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
"ldr b24, [x24, #0x0]\n"
"ldr b23, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
+ "ldr b22, [x22, #0x0]\n"
+ "ldr b21, [x21, #0x0]\n"
"11:" // Odd load end
- "zip1 v26.4s, v28.4s, v22.4s\n"
- "zip1 v25.4s, v27.4s, v21.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v20.4s, v24.4s, v19.4s\n"
- "zip1 v17.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v26.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
"subs x20, x20, #0x1\n"
- "zip2 v16.4s, v26.4s, v25.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v20.4s, v17.4s\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v22.4s, v28.4s, v22.4s\n"
- "zip2 v21.4s, v27.4s, v21.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v19.4s, v24.4s, v19.4s\n"
- "zip2 v18.4s, v23.4s, v18.4s\n"
- "zip1 v16.4s, v22.4s, v21.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v17.4s, v22.4s, v21.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
index 2db54126c0..6c41b5fdfb 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -153,202 +153,202 @@ void interleave_block<8, 4, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d30, [x28], #0x8\n"
- "ldr d29, [x27], #0x8\n"
- "ldr d28, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
+ "ldr d29, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v30.s }[2], [x28], #0x4\n"
- "ld1 { v29.s }[2], [x27], #0x4\n"
- "ld1 { v28.s }[2], [x26], #0x4\n"
- "ld1 { v27.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v29.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.h }[6], [x28], #0x2\n"
- "ld1 { v29.h }[6], [x27], #0x2\n"
+ "ld1 { v29.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
"mov x20, #0x4\n"
- "ld1 { v28.h }[6], [x26], #0x2\n"
- "ld1 { v27.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v26.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v27.h }[6], [x26], #0x2\n"
+ "ld1 { v26.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v24.h }[6], [x23], #0x2\n"
+ "ld1 { v23.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[14], [x28]\n"
- "ld1 { v29.b }[14], [x27]\n"
- "ld1 { v28.b }[14], [x26]\n"
- "ld1 { v27.b }[14], [x25]\n"
- "ld1 { v20.b }[14], [x24]\n"
- "ld1 { v26.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v29.b }[14], [x28]\n"
+ "ld1 { v28.b }[14], [x27]\n"
+ "ld1 { v27.b }[14], [x26]\n"
+ "ld1 { v26.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v24.b }[14], [x23]\n"
+ "ld1 { v23.b }[14], [x22]\n"
+ "ld1 { v22.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x20, #0x3\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[12], [x28]\n"
- "ld1 { v29.b }[12], [x27]\n"
+ "ld1 { v29.b }[12], [x28]\n"
+ "ld1 { v28.b }[12], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v28.b }[12], [x26]\n"
- "ld1 { v27.b }[12], [x25]\n"
- "ld1 { v20.b }[12], [x24]\n"
- "ld1 { v26.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v27.b }[12], [x26]\n"
+ "ld1 { v26.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v24.b }[12], [x23]\n"
+ "ld1 { v23.b }[12], [x22]\n"
+ "ld1 { v22.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v30.h }[4], [x28], #0x2\n"
- "ld1 { v29.h }[4], [x27], #0x2\n"
+ "ld1 { v29.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
"mov x20, #0x3\n"
- "ld1 { v28.h }[4], [x26], #0x2\n"
- "ld1 { v27.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v26.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v27.h }[4], [x26], #0x2\n"
+ "ld1 { v26.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v24.h }[4], [x23], #0x2\n"
+ "ld1 { v23.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[10], [x28]\n"
- "ld1 { v29.b }[10], [x27]\n"
- "ld1 { v28.b }[10], [x26]\n"
- "ld1 { v27.b }[10], [x25]\n"
- "ld1 { v20.b }[10], [x24]\n"
- "ld1 { v26.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v29.b }[10], [x28]\n"
+ "ld1 { v28.b }[10], [x27]\n"
+ "ld1 { v27.b }[10], [x26]\n"
+ "ld1 { v26.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v24.b }[10], [x23]\n"
+ "ld1 { v23.b }[10], [x22]\n"
+ "ld1 { v22.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[8], [x28]\n"
- "ld1 { v29.b }[8], [x27]\n"
+ "ld1 { v29.b }[8], [x28]\n"
+ "ld1 { v28.b }[8], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v28.b }[8], [x26]\n"
- "ld1 { v27.b }[8], [x25]\n"
- "ld1 { v20.b }[8], [x24]\n"
- "ld1 { v26.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v27.b }[8], [x26]\n"
+ "ld1 { v26.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v24.b }[8], [x23]\n"
+ "ld1 { v23.b }[8], [x22]\n"
+ "ld1 { v22.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s30, [x28], #0x4\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "ldr s27, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s29, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v20.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v29.b }[6], [x28]\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x28]\n"
+ "ld1 { v28.b }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v20.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h30, [x28], #0x2\n"
- "ldr h29, [x27], #0x2\n"
+ "ldr h29, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h28, [x26], #0x2\n"
- "ldr h27, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
+ "ldr h27, [x26], #0x2\n"
+ "ldr h26, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v20.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v29.b }[2], [x28]\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b30, [x28, #0x0]\n"
- "ldr b29, [x27, #0x0]\n"
+ "ldr b29, [x28, #0x0]\n"
+ "ldr b28, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b28, [x26, #0x0]\n"
- "ldr b27, [x25, #0x0]\n"
- "ldr b20, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
+ "ldr b27, [x26, #0x0]\n"
+ "ldr b26, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b22, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v22.4s, v30.4s, v28.4s\n"
"zip1 v21.4s, v29.4s, v27.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.4s, v20.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v18.4s\n"
- "zip1 v25.4s, v22.4s, v21.4s\n"
- "zip1 v24.4s, v17.4s, v16.4s\n"
- "str q25, [%x[out_ptr], #0x0]\n"
- "sadalp v2.8h, v25.16b\n"
- "str q24, [%x[out_ptr], #0x10]\n"
- "sadalp v1.8h, v24.16b\n"
+ "zip1 v19.4s, v25.4s, v23.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v23.4s, v22.4s, v21.4s\n"
- "zip2 v22.4s, v17.4s, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"subs x20, x20, #0x1\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "sadalp v2.8h, v23.16b\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "sadalp v1.8h, v22.16b\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v21.4s, v30.4s, v28.4s\n"
- "zip2 v17.4s, v29.4s, v27.4s\n"
+ "zip2 v21.4s, v29.4s, v27.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.4s, v20.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v18.4s\n"
- "zip1 v19.4s, v21.4s, v17.4s\n"
- "zip1 v18.4s, v20.4s, v16.4s\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "sadalp v2.8h, v19.16b\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "sadalp v1.8h, v18.16b\n"
+ "zip2 v19.4s, v25.4s, v23.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v21.4s, v17.4s\n"
- "zip2 v16.4s, v20.4s, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
"sadalp v2.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
index 44a79c0f0a..17eb7d5556 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -153,202 +153,202 @@ void interleave_block<8, 4, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d30, [x28], #0x8\n"
- "ldr d29, [x27], #0x8\n"
- "ldr d28, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d20, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
+ "ldr d29, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v30.s }[2], [x28], #0x4\n"
- "ld1 { v29.s }[2], [x27], #0x4\n"
- "ld1 { v28.s }[2], [x26], #0x4\n"
- "ld1 { v27.s }[2], [x25], #0x4\n"
- "ld1 { v20.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v29.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.h }[6], [x28], #0x2\n"
- "ld1 { v29.h }[6], [x27], #0x2\n"
+ "ld1 { v29.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
"mov x20, #0x4\n"
- "ld1 { v28.h }[6], [x26], #0x2\n"
- "ld1 { v27.h }[6], [x25], #0x2\n"
- "ld1 { v20.h }[6], [x24], #0x2\n"
- "ld1 { v26.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v27.h }[6], [x26], #0x2\n"
+ "ld1 { v26.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v24.h }[6], [x23], #0x2\n"
+ "ld1 { v23.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[14], [x28]\n"
- "ld1 { v29.b }[14], [x27]\n"
- "ld1 { v28.b }[14], [x26]\n"
- "ld1 { v27.b }[14], [x25]\n"
- "ld1 { v20.b }[14], [x24]\n"
- "ld1 { v26.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v29.b }[14], [x28]\n"
+ "ld1 { v28.b }[14], [x27]\n"
+ "ld1 { v27.b }[14], [x26]\n"
+ "ld1 { v26.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v24.b }[14], [x23]\n"
+ "ld1 { v23.b }[14], [x22]\n"
+ "ld1 { v22.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x20, #0x3\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[12], [x28]\n"
- "ld1 { v29.b }[12], [x27]\n"
+ "ld1 { v29.b }[12], [x28]\n"
+ "ld1 { v28.b }[12], [x27]\n"
"mov x20, #0x4\n"
- "ld1 { v28.b }[12], [x26]\n"
- "ld1 { v27.b }[12], [x25]\n"
- "ld1 { v20.b }[12], [x24]\n"
- "ld1 { v26.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v27.b }[12], [x26]\n"
+ "ld1 { v26.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v24.b }[12], [x23]\n"
+ "ld1 { v23.b }[12], [x22]\n"
+ "ld1 { v22.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v30.h }[4], [x28], #0x2\n"
- "ld1 { v29.h }[4], [x27], #0x2\n"
+ "ld1 { v29.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
"mov x20, #0x3\n"
- "ld1 { v28.h }[4], [x26], #0x2\n"
- "ld1 { v27.h }[4], [x25], #0x2\n"
- "ld1 { v20.h }[4], [x24], #0x2\n"
- "ld1 { v26.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v27.h }[4], [x26], #0x2\n"
+ "ld1 { v26.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v24.h }[4], [x23], #0x2\n"
+ "ld1 { v23.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[10], [x28]\n"
- "ld1 { v29.b }[10], [x27]\n"
- "ld1 { v28.b }[10], [x26]\n"
- "ld1 { v27.b }[10], [x25]\n"
- "ld1 { v20.b }[10], [x24]\n"
- "ld1 { v26.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v29.b }[10], [x28]\n"
+ "ld1 { v28.b }[10], [x27]\n"
+ "ld1 { v27.b }[10], [x26]\n"
+ "ld1 { v26.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v24.b }[10], [x23]\n"
+ "ld1 { v23.b }[10], [x22]\n"
+ "ld1 { v22.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[8], [x28]\n"
- "ld1 { v29.b }[8], [x27]\n"
+ "ld1 { v29.b }[8], [x28]\n"
+ "ld1 { v28.b }[8], [x27]\n"
"mov x20, #0x3\n"
- "ld1 { v28.b }[8], [x26]\n"
- "ld1 { v27.b }[8], [x25]\n"
- "ld1 { v20.b }[8], [x24]\n"
- "ld1 { v26.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v27.b }[8], [x26]\n"
+ "ld1 { v26.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v24.b }[8], [x23]\n"
+ "ld1 { v23.b }[8], [x22]\n"
+ "ld1 { v22.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s30, [x28], #0x4\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "ldr s27, [x25], #0x4\n"
- "ldr s20, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
+ "ldr s29, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
- "ld1 { v20.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[6], [x28]\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
- "ld1 { v20.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v29.b }[6], [x28]\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[4], [x28]\n"
- "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x28]\n"
+ "ld1 { v28.b }[4], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
- "ld1 { v20.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h30, [x28], #0x2\n"
- "ldr h29, [x27], #0x2\n"
+ "ldr h29, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h28, [x26], #0x2\n"
- "ldr h27, [x25], #0x2\n"
- "ldr h20, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
+ "ldr h27, [x26], #0x2\n"
+ "ldr h26, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v30.b }[2], [x28]\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
- "ld1 { v20.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v29.b }[2], [x28]\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b30, [x28, #0x0]\n"
- "ldr b29, [x27, #0x0]\n"
+ "ldr b29, [x28, #0x0]\n"
+ "ldr b28, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b28, [x26, #0x0]\n"
- "ldr b27, [x25, #0x0]\n"
- "ldr b20, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
+ "ldr b27, [x26, #0x0]\n"
+ "ldr b26, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b22, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v22.4s, v30.4s, v28.4s\n"
"zip1 v21.4s, v29.4s, v27.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
"subs x20, x20, #0x1\n"
- "zip1 v17.4s, v20.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v18.4s\n"
- "zip1 v25.4s, v22.4s, v21.4s\n"
- "zip1 v24.4s, v17.4s, v16.4s\n"
- "str q25, [%x[out_ptr], #0x0]\n"
- "uadalp v2.8h, v25.16b\n"
- "str q24, [%x[out_ptr], #0x10]\n"
- "uadalp v1.8h, v24.16b\n"
+ "zip1 v19.4s, v25.4s, v23.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v23.4s, v22.4s, v21.4s\n"
- "zip2 v22.4s, v17.4s, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"subs x20, x20, #0x1\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "uadalp v2.8h, v23.16b\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "uadalp v1.8h, v22.16b\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v21.4s, v30.4s, v28.4s\n"
- "zip2 v17.4s, v29.4s, v27.4s\n"
+ "zip2 v21.4s, v29.4s, v27.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
"subs x20, x20, #0x1\n"
- "zip2 v20.4s, v20.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v18.4s\n"
- "zip1 v19.4s, v21.4s, v17.4s\n"
- "zip1 v18.4s, v20.4s, v16.4s\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "uadalp v2.8h, v19.16b\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "uadalp v1.8h, v18.16b\n"
+ "zip2 v19.4s, v25.4s, v23.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v21.4s, v17.4s\n"
- "zip2 v16.4s, v20.4s, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
"uadalp v2.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
index 4bfb36082e..7b445ef3d4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -79,18 +79,18 @@ void interleave_block<8, 8, VLType::None, false>(
"prfm pldl1keep, [x21, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q26, [x28], #0x10\n"
- "ldr q21, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
"cmp %x[width], #0x10\n"
"ldr q25, [x26], #0x10\n"
"ldr q24, [x25], #0x10\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
"zip1 v18.2d, v25.2d, v24.2d\n"
"ldr q23, [x24], #0x10\n"
"ldr q22, [x23], #0x10\n"
"zip1 v17.2d, v23.2d, v22.2d\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
"ldr q20, [x22], #0x10\n"
"ldr q19, [x21], #0x10\n"
"str q16, [%x[out_ptr], #0x0]\n"
@@ -118,188 +118,187 @@ void interleave_block<8, 8, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d26, [x28], #0x8\n"
- "ldr d21, [x27], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "ldr d24, [x25], #0x8\n"
- "ldr d23, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v26.s }[2], [x28], #0x4\n"
- "ld1 { v21.s }[2], [x27], #0x4\n"
- "ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v24.s }[2], [x25], #0x4\n"
- "ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v26.h }[6], [x28], #0x2\n"
- "ld1 { v21.h }[6], [x27], #0x2\n"
+ "ld1 { v25.h }[6], [x28], #0x2\n"
+ "ld1 { v24.h }[6], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[6], [x26], #0x2\n"
- "ld1 { v24.h }[6], [x25], #0x2\n"
- "ld1 { v23.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v20.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v23.h }[6], [x26], #0x2\n"
+ "ld1 { v22.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "ld1 { v18.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[14], [x28]\n"
- "ld1 { v21.b }[14], [x27]\n"
- "ld1 { v25.b }[14], [x26]\n"
- "ld1 { v24.b }[14], [x25]\n"
- "ld1 { v23.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v20.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v25.b }[14], [x28]\n"
+ "ld1 { v24.b }[14], [x27]\n"
+ "ld1 { v23.b }[14], [x26]\n"
+ "ld1 { v22.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v20.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "ld1 { v18.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"mov x20, #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[12], [x28]\n"
- "ld1 { v21.b }[12], [x27]\n"
- "ld1 { v25.b }[12], [x26]\n"
- "ld1 { v24.b }[12], [x25]\n"
- "ld1 { v23.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v20.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v25.b }[12], [x28]\n"
+ "ld1 { v24.b }[12], [x27]\n"
+ "ld1 { v23.b }[12], [x26]\n"
+ "ld1 { v22.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v20.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v18.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v26.h }[4], [x28], #0x2\n"
- "ld1 { v21.h }[4], [x27], #0x2\n"
+ "ld1 { v25.h }[4], [x28], #0x2\n"
+ "ld1 { v24.h }[4], [x27], #0x2\n"
"mov x20, #0x2\n"
- "ld1 { v25.h }[4], [x26], #0x2\n"
- "ld1 { v24.h }[4], [x25], #0x2\n"
- "ld1 { v23.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v20.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v23.h }[4], [x26], #0x2\n"
+ "ld1 { v22.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[10], [x28]\n"
- "ld1 { v21.b }[10], [x27]\n"
- "ld1 { v25.b }[10], [x26]\n"
- "ld1 { v24.b }[10], [x25]\n"
- "ld1 { v23.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v20.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v25.b }[10], [x28]\n"
+ "ld1 { v24.b }[10], [x27]\n"
+ "ld1 { v23.b }[10], [x26]\n"
+ "ld1 { v22.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v20.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v18.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[8], [x28]\n"
- "ld1 { v21.b }[8], [x27]\n"
+ "ld1 { v25.b }[8], [x28]\n"
+ "ld1 { v24.b }[8], [x27]\n"
"mov x20, #0x2\n"
- "ld1 { v25.b }[8], [x26]\n"
- "ld1 { v24.b }[8], [x25]\n"
- "ld1 { v23.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v20.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v23.b }[8], [x26]\n"
+ "ld1 { v22.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v20.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v18.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s26, [x28], #0x4\n"
- "ldr s21, [x27], #0x4\n"
- "ldr s25, [x26], #0x4\n"
- "ldr s24, [x25], #0x4\n"
- "ldr s23, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v26.h }[2], [x28], #0x2\n"
- "ld1 { v21.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x28], #0x2\n"
+ "ld1 { v24.h }[2], [x27], #0x2\n"
"mov x20, #0x1\n"
- "ld1 { v25.h }[2], [x26], #0x2\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v20.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v23.h }[2], [x26], #0x2\n"
+ "ld1 { v22.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[6], [x28]\n"
- "ld1 { v21.b }[6], [x27]\n"
- "ld1 { v25.b }[6], [x26]\n"
- "ld1 { v24.b }[6], [x25]\n"
- "ld1 { v23.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v20.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x28]\n"
+ "ld1 { v24.b }[6], [x27]\n"
+ "ld1 { v23.b }[6], [x26]\n"
+ "ld1 { v22.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v20.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v18.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[4], [x28]\n"
- "ld1 { v21.b }[4], [x27]\n"
- "ld1 { v25.b }[4], [x26]\n"
- "ld1 { v24.b }[4], [x25]\n"
- "ld1 { v23.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v20.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x28]\n"
+ "ld1 { v24.b }[4], [x27]\n"
+ "ld1 { v23.b }[4], [x26]\n"
+ "ld1 { v22.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v20.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v18.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h26, [x28], #0x2\n"
- "ldr h21, [x27], #0x2\n"
+ "ldr h25, [x28], #0x2\n"
+ "ldr h24, [x27], #0x2\n"
"mov x20, #0x1\n"
- "ldr h25, [x26], #0x2\n"
- "ldr h24, [x25], #0x2\n"
- "ldr h23, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h20, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
+ "ldr h23, [x26], #0x2\n"
+ "ldr h22, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v26.b }[2], [x28]\n"
- "ld1 { v21.b }[2], [x27]\n"
- "ld1 { v25.b }[2], [x26]\n"
- "ld1 { v24.b }[2], [x25]\n"
- "ld1 { v23.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v20.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x28]\n"
+ "ld1 { v24.b }[2], [x27]\n"
+ "ld1 { v23.b }[2], [x26]\n"
+ "ld1 { v22.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v20.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v18.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b26, [x28, #0x0]\n"
- "ldr b21, [x27, #0x0]\n"
+ "ldr b25, [x28, #0x0]\n"
+ "ldr b24, [x27, #0x0]\n"
"mov x20, #0x1\n"
- "ldr b25, [x26, #0x0]\n"
- "ldr b24, [x25, #0x0]\n"
- "ldr b23, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b20, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
+ "ldr b23, [x26, #0x0]\n"
+ "ldr b22, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b20, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
"11:" // Odd load end
"subs x20, x20, #0x1\n"
- "zip1 v16.2d, v26.2d, v21.2d\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v23.2d, v22.2d\n"
- "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 12f\n"
- "zip2 v21.2d, v26.2d, v21.2d\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v25.2d, v24.2d\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "zip2 v17.2d, v23.2d, v22.2d\n"
- "zip2 v16.2d, v20.2d, v19.2d\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
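The hunk above renames the working registers (v26…v19 become v25…v18, matching the trimmed clobber list in the constraint section) without changing behaviour: the odd-width tail still loads up to eight bytes per row, then pairs rows with zip1/zip2 on 64-bit lanes before storing. As a reference for that layout, here is a minimal scalar C++ sketch of the same eight-row, eight-byte-block interleave; the function name and the assumption of eight valid row pointers are illustrative, not part of the library.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of the block-8 interleave above: each 8-byte column block is
// emitted row-major for all eight rows, with the odd tail zero-padded. This
// is exactly the layout the zip1 (low halves) / zip2 (high halves) pairs
// produce on q-register stores.
static void interleave8_block8_ref(uint8_t *&out, const uint8_t *const *in,
                                   size_t width, size_t row_offset)
{
    for (size_t x = 0; x < width; x += 8)
    {
        const size_t valid = std::min<size_t>(8, width - x);
        for (size_t r = 0; r < 8; r++)
        {
            std::memset(out, 0, 8);                          // pad the odd tail
            std::memcpy(out, in[r] + row_offset + x, valid); // copy live bytes
            out += 8;
        }
    }
}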
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
index c6ad2949f5..a2288e8299 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -156,182 +156,182 @@ void interleave_block<8, 8, VLType::None, true>(
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
"ldr d27, [x28], #0x8\n"
- "ldr d19, [x27], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
"ldr d25, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
"ldr d23, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
"ld1 { v27.s }[2], [x28], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x27], #0x4\n"
"ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x25], #0x4\n"
"ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v27.h }[6], [x28], #0x2\n"
- "ld1 { v19.h }[6], [x27], #0x2\n"
+ "ld1 { v26.h }[6], [x27], #0x2\n"
"mov x20, #0x2\n"
"ld1 { v25.h }[6], [x26], #0x2\n"
- "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v24.h }[6], [x25], #0x2\n"
"ld1 { v23.h }[6], [x24], #0x2\n"
- "ld1 { v17.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
"ld1 { v21.h }[6], [x22], #0x2\n"
- "ld1 { v16.h }[6], [x21], #0x2\n"
+ "ld1 { v20.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[14], [x28]\n"
- "ld1 { v19.b }[14], [x27]\n"
+ "ld1 { v26.b }[14], [x27]\n"
"ld1 { v25.b }[14], [x26]\n"
- "ld1 { v18.b }[14], [x25]\n"
+ "ld1 { v24.b }[14], [x25]\n"
"ld1 { v23.b }[14], [x24]\n"
- "ld1 { v17.b }[14], [x23]\n"
+ "ld1 { v22.b }[14], [x23]\n"
"ld1 { v21.b }[14], [x22]\n"
- "ld1 { v16.b }[14], [x21]\n"
+ "ld1 { v20.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[12], [x28]\n"
- "ld1 { v19.b }[12], [x27]\n"
+ "ld1 { v26.b }[12], [x27]\n"
"ld1 { v25.b }[12], [x26]\n"
- "ld1 { v18.b }[12], [x25]\n"
+ "ld1 { v24.b }[12], [x25]\n"
"ld1 { v23.b }[12], [x24]\n"
- "ld1 { v17.b }[12], [x23]\n"
+ "ld1 { v22.b }[12], [x23]\n"
"ld1 { v21.b }[12], [x22]\n"
- "ld1 { v16.b }[12], [x21]\n"
+ "ld1 { v20.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
"ld1 { v27.h }[4], [x28], #0x2\n"
- "ld1 { v19.h }[4], [x27], #0x2\n"
+ "ld1 { v26.h }[4], [x27], #0x2\n"
"mov x20, #0x2\n"
"ld1 { v25.h }[4], [x26], #0x2\n"
- "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v24.h }[4], [x25], #0x2\n"
"ld1 { v23.h }[4], [x24], #0x2\n"
- "ld1 { v17.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
"ld1 { v21.h }[4], [x22], #0x2\n"
- "ld1 { v16.h }[4], [x21], #0x2\n"
+ "ld1 { v20.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[10], [x28]\n"
- "ld1 { v19.b }[10], [x27]\n"
+ "ld1 { v26.b }[10], [x27]\n"
"ld1 { v25.b }[10], [x26]\n"
- "ld1 { v18.b }[10], [x25]\n"
+ "ld1 { v24.b }[10], [x25]\n"
"ld1 { v23.b }[10], [x24]\n"
- "ld1 { v17.b }[10], [x23]\n"
+ "ld1 { v22.b }[10], [x23]\n"
"ld1 { v21.b }[10], [x22]\n"
- "ld1 { v16.b }[10], [x21]\n"
+ "ld1 { v20.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[8], [x28]\n"
- "ld1 { v19.b }[8], [x27]\n"
+ "ld1 { v26.b }[8], [x27]\n"
"mov x20, #0x2\n"
"ld1 { v25.b }[8], [x26]\n"
- "ld1 { v18.b }[8], [x25]\n"
+ "ld1 { v24.b }[8], [x25]\n"
"ld1 { v23.b }[8], [x24]\n"
- "ld1 { v17.b }[8], [x23]\n"
+ "ld1 { v22.b }[8], [x23]\n"
"ld1 { v21.b }[8], [x22]\n"
- "ld1 { v16.b }[8], [x21]\n"
+ "ld1 { v20.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
"ldr s27, [x28], #0x4\n"
- "ldr s19, [x27], #0x4\n"
+ "ldr s26, [x27], #0x4\n"
"ldr s25, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
+ "ldr s24, [x25], #0x4\n"
"ldr s23, [x24], #0x4\n"
- "ldr s17, [x23], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"ldr s21, [x22], #0x4\n"
- "ldr s16, [x21], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
"ld1 { v27.h }[2], [x28], #0x2\n"
- "ld1 { v19.h }[2], [x27], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
"mov x20, #0x1\n"
"ld1 { v25.h }[2], [x26], #0x2\n"
- "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
"ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
"ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v16.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[6], [x28]\n"
- "ld1 { v19.b }[6], [x27]\n"
+ "ld1 { v26.b }[6], [x27]\n"
"ld1 { v25.b }[6], [x26]\n"
- "ld1 { v18.b }[6], [x25]\n"
+ "ld1 { v24.b }[6], [x25]\n"
"ld1 { v23.b }[6], [x24]\n"
- "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v22.b }[6], [x23]\n"
"ld1 { v21.b }[6], [x22]\n"
- "ld1 { v16.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[4], [x28]\n"
- "ld1 { v19.b }[4], [x27]\n"
+ "ld1 { v26.b }[4], [x27]\n"
"ld1 { v25.b }[4], [x26]\n"
- "ld1 { v18.b }[4], [x25]\n"
+ "ld1 { v24.b }[4], [x25]\n"
"ld1 { v23.b }[4], [x24]\n"
- "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v22.b }[4], [x23]\n"
"ld1 { v21.b }[4], [x22]\n"
- "ld1 { v16.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
"ldr h27, [x28], #0x2\n"
- "ldr h19, [x27], #0x2\n"
+ "ldr h26, [x27], #0x2\n"
"mov x20, #0x1\n"
"ldr h25, [x26], #0x2\n"
- "ldr h18, [x25], #0x2\n"
+ "ldr h24, [x25], #0x2\n"
"ldr h23, [x24], #0x2\n"
- "ldr h17, [x23], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
"ldr h21, [x22], #0x2\n"
- "ldr h16, [x21], #0x2\n"
+ "ldr h20, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[2], [x28]\n"
- "ld1 { v19.b }[2], [x27]\n"
+ "ld1 { v26.b }[2], [x27]\n"
"ld1 { v25.b }[2], [x26]\n"
- "ld1 { v18.b }[2], [x25]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"ld1 { v23.b }[2], [x24]\n"
- "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v22.b }[2], [x23]\n"
"ld1 { v21.b }[2], [x22]\n"
- "ld1 { v16.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
"ldr b27, [x28, #0x0]\n"
- "ldr b19, [x27, #0x0]\n"
+ "ldr b26, [x27, #0x0]\n"
"mov x20, #0x1\n"
"ldr b25, [x26, #0x0]\n"
- "ldr b18, [x25, #0x0]\n"
+ "ldr b24, [x25, #0x0]\n"
"ldr b23, [x24, #0x0]\n"
- "ldr b17, [x23, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
"ldr b21, [x22, #0x0]\n"
- "ldr b16, [x21, #0x0]\n"
+ "ldr b20, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v26.2d, v27.2d, v19.2d\n"
- "zip1 v24.2d, v25.2d, v18.2d\n"
+ "zip1 v19.2d, v27.2d, v26.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
"subs x20, x20, #0x1\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v22.2d, v23.2d, v17.2d\n"
- "zip1 v20.2d, v21.2d, v16.2d\n"
- "str q24, [%x[out_ptr], #0x10]\n"
- "sadalp v5.8h, v26.16b\n"
- "sadalp v4.8h, v24.16b\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "sadalp v3.8h, v22.16b\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "sadalp v2.8h, v20.16b\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip1 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v5.8h, v19.16b\n"
+ "sadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
- "zip2 v19.2d, v27.2d, v19.2d\n"
- "zip2 v18.2d, v25.2d, v18.2d\n"
+ "zip2 v19.2d, v27.2d, v26.2d\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"str q19, [%x[out_ptr], #0x0]\n"
- "zip2 v17.2d, v23.2d, v17.2d\n"
- "zip2 v16.2d, v21.2d, v16.2d\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v16.2d, v21.2d, v20.2d\n"
"str q18, [%x[out_ptr], #0x10]\n"
"sadalp v5.8h, v19.16b\n"
"sadalp v4.8h, v18.16b\n"
@@ -346,11 +346,11 @@ void interleave_block<8, 8, VLType::None, true>(
"sadalp v31.4s, v3.8h\n"
"sadalp v30.4s, v2.8h\n"
"addp v1.4s, v1.4s, v0.4s\n"
- "addp v0.4s, v31.4s, v30.4s\n"
+ "addp v16.4s, v31.4s, v30.4s\n"
"add v1.4s, v1.4s, v29.4s\n"
- "add v0.4s, v0.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
"str q1, [%x[out_ptr], #0x0]\n"
- "str q0, [%x[out_ptr], #0x10]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
index 6c4a5fa62b..56d34a8a64 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -156,182 +156,182 @@ void interleave_block<8, 8, VLType::None, true>(
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
"ldr d27, [x28], #0x8\n"
- "ldr d19, [x27], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
"ldr d25, [x26], #0x8\n"
- "ldr d18, [x25], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
"ldr d23, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
- "ldr d16, [x21], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
"ld1 { v27.s }[2], [x28], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x27], #0x4\n"
"ld1 { v25.s }[2], [x26], #0x4\n"
- "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x25], #0x4\n"
"ld1 { v23.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v27.h }[6], [x28], #0x2\n"
- "ld1 { v19.h }[6], [x27], #0x2\n"
+ "ld1 { v26.h }[6], [x27], #0x2\n"
"mov x20, #0x2\n"
"ld1 { v25.h }[6], [x26], #0x2\n"
- "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v24.h }[6], [x25], #0x2\n"
"ld1 { v23.h }[6], [x24], #0x2\n"
- "ld1 { v17.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
"ld1 { v21.h }[6], [x22], #0x2\n"
- "ld1 { v16.h }[6], [x21], #0x2\n"
+ "ld1 { v20.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[14], [x28]\n"
- "ld1 { v19.b }[14], [x27]\n"
+ "ld1 { v26.b }[14], [x27]\n"
"ld1 { v25.b }[14], [x26]\n"
- "ld1 { v18.b }[14], [x25]\n"
+ "ld1 { v24.b }[14], [x25]\n"
"ld1 { v23.b }[14], [x24]\n"
- "ld1 { v17.b }[14], [x23]\n"
+ "ld1 { v22.b }[14], [x23]\n"
"ld1 { v21.b }[14], [x22]\n"
- "ld1 { v16.b }[14], [x21]\n"
+ "ld1 { v20.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[12], [x28]\n"
- "ld1 { v19.b }[12], [x27]\n"
+ "ld1 { v26.b }[12], [x27]\n"
"ld1 { v25.b }[12], [x26]\n"
- "ld1 { v18.b }[12], [x25]\n"
+ "ld1 { v24.b }[12], [x25]\n"
"ld1 { v23.b }[12], [x24]\n"
- "ld1 { v17.b }[12], [x23]\n"
+ "ld1 { v22.b }[12], [x23]\n"
"ld1 { v21.b }[12], [x22]\n"
- "ld1 { v16.b }[12], [x21]\n"
+ "ld1 { v20.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
"ld1 { v27.h }[4], [x28], #0x2\n"
- "ld1 { v19.h }[4], [x27], #0x2\n"
+ "ld1 { v26.h }[4], [x27], #0x2\n"
"mov x20, #0x2\n"
"ld1 { v25.h }[4], [x26], #0x2\n"
- "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v24.h }[4], [x25], #0x2\n"
"ld1 { v23.h }[4], [x24], #0x2\n"
- "ld1 { v17.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
"ld1 { v21.h }[4], [x22], #0x2\n"
- "ld1 { v16.h }[4], [x21], #0x2\n"
+ "ld1 { v20.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[10], [x28]\n"
- "ld1 { v19.b }[10], [x27]\n"
+ "ld1 { v26.b }[10], [x27]\n"
"ld1 { v25.b }[10], [x26]\n"
- "ld1 { v18.b }[10], [x25]\n"
+ "ld1 { v24.b }[10], [x25]\n"
"ld1 { v23.b }[10], [x24]\n"
- "ld1 { v17.b }[10], [x23]\n"
+ "ld1 { v22.b }[10], [x23]\n"
"ld1 { v21.b }[10], [x22]\n"
- "ld1 { v16.b }[10], [x21]\n"
+ "ld1 { v20.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[8], [x28]\n"
- "ld1 { v19.b }[8], [x27]\n"
+ "ld1 { v26.b }[8], [x27]\n"
"mov x20, #0x2\n"
"ld1 { v25.b }[8], [x26]\n"
- "ld1 { v18.b }[8], [x25]\n"
+ "ld1 { v24.b }[8], [x25]\n"
"ld1 { v23.b }[8], [x24]\n"
- "ld1 { v17.b }[8], [x23]\n"
+ "ld1 { v22.b }[8], [x23]\n"
"ld1 { v21.b }[8], [x22]\n"
- "ld1 { v16.b }[8], [x21]\n"
+ "ld1 { v20.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
"ldr s27, [x28], #0x4\n"
- "ldr s19, [x27], #0x4\n"
+ "ldr s26, [x27], #0x4\n"
"ldr s25, [x26], #0x4\n"
- "ldr s18, [x25], #0x4\n"
+ "ldr s24, [x25], #0x4\n"
"ldr s23, [x24], #0x4\n"
- "ldr s17, [x23], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"ldr s21, [x22], #0x4\n"
- "ldr s16, [x21], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
"ld1 { v27.h }[2], [x28], #0x2\n"
- "ld1 { v19.h }[2], [x27], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
"mov x20, #0x1\n"
"ld1 { v25.h }[2], [x26], #0x2\n"
- "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
"ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
"ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v16.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[6], [x28]\n"
- "ld1 { v19.b }[6], [x27]\n"
+ "ld1 { v26.b }[6], [x27]\n"
"ld1 { v25.b }[6], [x26]\n"
- "ld1 { v18.b }[6], [x25]\n"
+ "ld1 { v24.b }[6], [x25]\n"
"ld1 { v23.b }[6], [x24]\n"
- "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v22.b }[6], [x23]\n"
"ld1 { v21.b }[6], [x22]\n"
- "ld1 { v16.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[4], [x28]\n"
- "ld1 { v19.b }[4], [x27]\n"
+ "ld1 { v26.b }[4], [x27]\n"
"ld1 { v25.b }[4], [x26]\n"
- "ld1 { v18.b }[4], [x25]\n"
+ "ld1 { v24.b }[4], [x25]\n"
"ld1 { v23.b }[4], [x24]\n"
- "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v22.b }[4], [x23]\n"
"ld1 { v21.b }[4], [x22]\n"
- "ld1 { v16.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
"ldr h27, [x28], #0x2\n"
- "ldr h19, [x27], #0x2\n"
+ "ldr h26, [x27], #0x2\n"
"mov x20, #0x1\n"
"ldr h25, [x26], #0x2\n"
- "ldr h18, [x25], #0x2\n"
+ "ldr h24, [x25], #0x2\n"
"ldr h23, [x24], #0x2\n"
- "ldr h17, [x23], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
"ldr h21, [x22], #0x2\n"
- "ldr h16, [x21], #0x2\n"
+ "ldr h20, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[2], [x28]\n"
- "ld1 { v19.b }[2], [x27]\n"
+ "ld1 { v26.b }[2], [x27]\n"
"ld1 { v25.b }[2], [x26]\n"
- "ld1 { v18.b }[2], [x25]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"ld1 { v23.b }[2], [x24]\n"
- "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v22.b }[2], [x23]\n"
"ld1 { v21.b }[2], [x22]\n"
- "ld1 { v16.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
"ldr b27, [x28, #0x0]\n"
- "ldr b19, [x27, #0x0]\n"
+ "ldr b26, [x27, #0x0]\n"
"mov x20, #0x1\n"
"ldr b25, [x26, #0x0]\n"
- "ldr b18, [x25, #0x0]\n"
+ "ldr b24, [x25, #0x0]\n"
"ldr b23, [x24, #0x0]\n"
- "ldr b17, [x23, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
"ldr b21, [x22, #0x0]\n"
- "ldr b16, [x21, #0x0]\n"
+ "ldr b20, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v26.2d, v27.2d, v19.2d\n"
- "zip1 v24.2d, v25.2d, v18.2d\n"
+ "zip1 v19.2d, v27.2d, v26.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
"subs x20, x20, #0x1\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v22.2d, v23.2d, v17.2d\n"
- "zip1 v20.2d, v21.2d, v16.2d\n"
- "str q24, [%x[out_ptr], #0x10]\n"
- "uadalp v5.8h, v26.16b\n"
- "uadalp v4.8h, v24.16b\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "uadalp v3.8h, v22.16b\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "uadalp v2.8h, v20.16b\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip1 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v5.8h, v19.16b\n"
+ "uadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
- "zip2 v19.2d, v27.2d, v19.2d\n"
- "zip2 v18.2d, v25.2d, v18.2d\n"
+ "zip2 v19.2d, v27.2d, v26.2d\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"str q19, [%x[out_ptr], #0x0]\n"
- "zip2 v17.2d, v23.2d, v17.2d\n"
- "zip2 v16.2d, v21.2d, v16.2d\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v16.2d, v21.2d, v20.2d\n"
"str q18, [%x[out_ptr], #0x10]\n"
"uadalp v5.8h, v19.16b\n"
"uadalp v4.8h, v18.16b\n"
@@ -346,11 +346,11 @@ void interleave_block<8, 8, VLType::None, true>(
"uadalp v31.4s, v3.8h\n"
"uadalp v30.4s, v2.8h\n"
"addp v1.4s, v1.4s, v0.4s\n"
- "addp v0.4s, v31.4s, v30.4s\n"
+ "addp v16.4s, v31.4s, v30.4s\n"
"add v1.4s, v1.4s, v29.4s\n"
- "add v0.4s, v0.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
"str q1, [%x[out_ptr], #0x0]\n"
- "str q0, [%x[out_ptr], #0x10]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
index 51b91d16e1..a5f4754d3d 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
template <>
void interleave_block<1, 2, VLType::SME, false>(
bfloat16 * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x22, ALL, MUL #2\n"
@@ -153,4 +151,4 @@ void interleave_block<1, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
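This and the following SME/SME2 interleaves switch their guard from the compiler's __ARM_FEATURE_SVE macro to the library's own ARM_COMPUTE_ENABLE_SME / ARM_COMPUTE_ENABLE_SME2 build flags, so the hand-encoded .inst kernels are compiled whenever the build opts in, independently of the compiler's SVE target, with actual selection left to runtime CPU detection. A minimal sketch of that compile-time/runtime split; the CpuFeatures struct and function names below are stand-ins, not the library's API.

#include <cstdio>

struct CpuFeatures { bool sme2; };   // stand-in for runtime CPU detection

#if defined(ARM_COMPUTE_ENABLE_SME2)
// In the real files this is a hand-encoded .inst kernel, so no SME2-capable
// compiler is required -- only the build flag.
static void interleave_sme2() { std::puts("SME2 interleave"); }
#endif

static void select_interleave(const CpuFeatures &f)
{
    (void)f;
#if defined(ARM_COMPUTE_ENABLE_SME2)  // gated by the build flag, not __ARM_FEATURE_SVE
    if (f.sme2)
    {
        interleave_sme2();
        return;
    }
#endif
    std::puts("NEON fallback");       // always available
}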
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
index 25bfad18b1..c1d0ac5bc7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
template <>
void interleave_block<2, 2, VLType::SME, false>(
bfloat16 * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x22, ALL, MUL #2\n"
@@ -184,4 +182,4 @@ void interleave_block<2, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
index 9255831e86..03575d7ff2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
template <>
void interleave_block<4, 2, VLType::SME, false>(
bfloat16 * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x23, ALL, MUL #2\n"
@@ -159,4 +157,4 @@ void interleave_block<4, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
index 9b66a6fb10..453778ae3f 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 1, VLType::SME, false>(
bfloat16 * &out, const bfloat16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x21, %x[width]\n"
@@ -168,9 +166,9 @@ void interleave_block<1, 1, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
"add x12, x12, #0x1\n"
"cmp x12, x11\n"
"add x26, x26, #0x8\n"
@@ -186,7 +184,7 @@ void interleave_block<1, 1, VLType::SME, false>(
"cmp x12, x10\n"
"addvl x21, x21, #1\n"
"blt 10b\n"
- "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -206,4 +204,4 @@ void interleave_block<1, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
index d0375de76f..98bdcd2fa2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 2, VLType::SME, false>(
bfloat16 * &out, const bfloat16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cnth x22\n"
@@ -176,11 +174,11 @@ void interleave_block<1, 2, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
"add x12, x12, #0x1\n"
".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
"cmp x12, x10\n"
- ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0xe0562281 // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n"
"add x26, x26, #0x8\n"
"addvl x21, x21, #1\n"
"add x13, x13, #0x2\n"
@@ -197,7 +195,7 @@ void interleave_block<1, 2, VLType::SME, false>(
"addvl x21, x21, #1\n"
"add x20, x20, #0x2\n"
"blt 10b\n"
- "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -217,4 +215,4 @@ void interleave_block<1, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
index 622d9aa4fc..4390bb7c7f 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 4, VLType::SME, false>(
int8_t * &out, const int8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntb x21\n"
@@ -179,11 +177,11 @@ void interleave_block<1, 4, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
"add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"cmp x12, x9\n"
- ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
"add x26, x26, #0x8\n"
"addvl x21, x21, #1\n"
"add x13, x13, #0x4\n"
@@ -200,7 +198,7 @@ void interleave_block<1, 4, VLType::SME, false>(
"addvl x21, x21, #1\n"
"add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -220,4 +218,4 @@ void interleave_block<1, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
index 07f03702d9..f5ee261964 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 4, VLType::SME, true>(
@@ -200,12 +200,12 @@ void interleave_block<1, 4, VLType::SME, true>(
"10:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
- "ldr x22, [x23, #0x0]\n"
+ "ldr x20, [x23, #0x0]\n"
".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"sdot z17.s, z16.b, z18.b\n"
- ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
"cmp x12, x9\n"
"add x23, x23, #0x8\n"
"addvl x24, x24, #1\n"
@@ -225,7 +225,7 @@ void interleave_block<1, 4, VLType::SME, true>(
"addvl x24, x24, #1\n"
"add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -249,4 +249,4 @@ void interleave_block<1, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
index 618570de08..76c1d053cd 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 4, VLType::SME, false>(
uint8_t * &out, const uint8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntb x21\n"
@@ -179,11 +177,11 @@ void interleave_block<1, 4, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
"add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"cmp x12, x9\n"
- ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
"add x26, x26, #0x8\n"
"addvl x21, x21, #1\n"
"add x13, x13, #0x4\n"
@@ -200,7 +198,7 @@ void interleave_block<1, 4, VLType::SME, false>(
"addvl x21, x21, #1\n"
"add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -220,4 +218,4 @@ void interleave_block<1, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
index 646db0caa8..daf2d3a100 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 4, VLType::SME, true>(
@@ -200,12 +200,12 @@ void interleave_block<1, 4, VLType::SME, true>(
"10:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
- "ldr x22, [x23, #0x0]\n"
+ "ldr x20, [x23, #0x0]\n"
".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
"add x12, x12, #0x1\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
"udot z17.s, z16.b, z18.b\n"
- ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
"cmp x12, x9\n"
"add x23, x23, #0x8\n"
"addvl x24, x24, #1\n"
@@ -225,7 +225,7 @@ void interleave_block<1, 4, VLType::SME, true>(
"addvl x24, x24, #1\n"
"add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -249,4 +249,4 @@ void interleave_block<1, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
index 788c1a2eca..274f69f370 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 1, VLType::SME, false>(
__fp16 * &out, const __fp16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x21, %x[width]\n"
@@ -168,9 +166,9 @@ void interleave_block<1, 1, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
- ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
"add x12, x12, #0x1\n"
"cmp x12, x11\n"
"add x26, x26, #0x8\n"
@@ -186,7 +184,7 @@ void interleave_block<1, 1, VLType::SME, false>(
"cmp x12, x10\n"
"addvl x21, x21, #1\n"
"blt 10b\n"
- "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -206,4 +204,4 @@ void interleave_block<1, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
index 7de88543d7..ab290649fd 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<1, 1, VLType::SME, false>(
float * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x22, %x[width]\n"
@@ -167,9 +165,9 @@ void interleave_block<1, 1, VLType::SME, false>(
"9:" // K loop: Tails: Even: First
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
- "ldr x25, [x26, #0x0]\n"
+ "ldr x20, [x26, #0x0]\n"
".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
- ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0960288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x22, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x10\n"
"add x26, x26, #0x8\n"
@@ -185,7 +183,7 @@ void interleave_block<1, 1, VLType::SME, false>(
"cmp x12, x9\n"
"addvl x21, x21, #1\n"
"blt 10b\n"
- "whilelt p9.s, x27, %x[width]\n"
+ "whilelt p8.s, x27, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -205,4 +203,4 @@ void interleave_block<1, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
index 14ee5d6304..dc6d12b61e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 1, VLType::SME, false>(
bfloat16 * &out, const bfloat16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cnth x28\n"
@@ -97,4 +95,4 @@ void interleave_block<2, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
index f648ccf771..d9189258c1 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
@@ -22,32 +22,30 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 2, VLType::SME, false>(
bfloat16 * &out, const bfloat16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "cnth x21\n"
- "mov x22, %x[width]\n"
- "inch x22\n"
+ "cnth x22\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
"mov x20, %x[width]\n"
- "sub x17, x21, #0x1\n"
- "sub x22, x22, #0x1\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
"ands x17, x20, x17\n"
"cntw x16\n"
- "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL<T>)
- "csel x17, x17, x21, NE\n"
- "sub x13, x22, #0x1\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
"add x17, x17, #0x1\n"
"sub x15, x16, #0x2\n"
- "lsl x21, %x[height], #0x1\n" // height * 2
+ "lsl x22, %x[height], #0x1\n" // height * 2
"lsl x20, x16, #0x1\n"
"mov x14, #0x0\n"
"mov x11, %x[in]\n"
@@ -57,15 +55,15 @@ void interleave_block<2, 2, VLType::SME, false>(
"cntw x27, ALL, MUL #3\n"
"ldr x26, [x10, #0x0]\n"
"lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
"lsr x17, x17, #0x1\n"
"ptrue p13.s\n"
- "ldr x23, [x10, #0x8]\n"
- "whilelt p12.h, XZR, x21\n"
- "whilelt p11.h, x20, x21\n"
- "mov x22, %x[row_offset]\n"
- "mov x21, %x[out]\n"
+ "ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
"whilelt p10.h, x14, %x[width]\n"
"whilelt p9.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
@@ -76,39 +74,39 @@ void interleave_block<2, 2, VLType::SME, false>(
"1:" // K loop: Charge: Loop
".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"ldr x26, [x10, #0x0]\n"
- ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"add x12, x12, #0x4\n"
"cmp x12, x15, LSL #1\n"
- "ldr x23, [x10, #0x8]\n"
+ "ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
"blt 1b\n"
"2:" // K loop: Charge: End
".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
- ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
- "inch x22\n"
+ "inch x23\n"
"inch x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- "ldr x23, [x10, #0x8]\n"
+ "ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
"cbz x13, 8f\n"
"mov x20, x13\n"
@@ -121,60 +119,60 @@ void interleave_block<2, 2, VLType::SME, false>(
"4:" // K loop: Main loop: First: Loop
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x15\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
"ldr x9, [x11, #0x0]\n"
".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
- ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.h, x14, %x[width]\n"
"inch x14\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
- "addvl x21, x21, #4\n"
- "inch x22\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x23\n"
"whilelt p9.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
"mov x13, #0x0\n"
@@ -183,61 +181,61 @@ void interleave_block<2, 2, VLType::SME, false>(
"6:" // K loop: Main loop: Second: Loop
".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x15\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
"ldr x9, [x11, #0x0]\n"
".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.h, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
- "addvl x21, x21, #4\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
"inch x14\n"
- "inch x22\n"
+ "inch x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
"cbnz x25, 11f\n"
@@ -248,51 +246,51 @@ void interleave_block<2, 2, VLType::SME, false>(
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
- "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
"add x12, x12, #0x1\n"
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
- "ldr x26, [x11, x16, LSL #0x3]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
"cmp x12, x16\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+ ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
"add x11, x11, #0x8\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
"whilelt p10.h, x14, %x[width]\n"
- "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"add x20, x20, #0x2\n"
"blt 10b\n"
- "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"blt 12b\n"
"13:" // K loop: End
- "mov %x[out], x21\n"
+ "mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -300,4 +298,4 @@ void interleave_block<2, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
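The K-loop prologue in the hunk above spells out its pass bookkeeping in comments: n_passes = ceildiv(width, VL<T>), n_loops = (n_passes - 1) / 2 because the main loop retires two passes per iteration, and odd_tail = bool(n_passes & 0x1) for the unpaired final pass. A worked C++ sketch of that arithmetic (width and vector length below are illustrative values):

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t width = 70, vl = 16;              // vl stands in for VL<T>
    const size_t n_passes = (width + vl - 1) / vl; // ceildiv(width, VL<T>)
    const size_t n_loops  = (n_passes - 1) / 2;    // two passes per main-loop iteration
    const bool   odd_tail = (n_passes & 1) != 0;   // unpaired final pass handled separately
    // The kernel computes this as width & (vl - 1), which assumes a
    // power-of-two vector length; the modulo form expresses the intent.
    size_t tail = width % vl;
    if (tail == 0) tail = vl;                      // csel: full pass when evenly divided
    std::printf("n_passes=%zu n_loops=%zu odd_tail=%d tail=%zu\n",
                n_passes, n_loops, odd_tail, tail);
    return 0;  // prints: n_passes=5 n_loops=2 odd_tail=1 tail=6
}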
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
index 61536d38a5..ef787c89b9 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
@@ -22,32 +22,30 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 2, VLType::SME, false>(
__fp16 * &out, const __fp16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "cnth x21\n"
- "mov x22, %x[width]\n"
- "inch x22\n"
+ "cnth x22\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
"mov x20, %x[width]\n"
- "sub x17, x21, #0x1\n"
- "sub x22, x22, #0x1\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
"ands x17, x20, x17\n"
"cntw x16\n"
- "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL<T>)
- "csel x17, x17, x21, NE\n"
- "sub x13, x22, #0x1\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
"add x17, x17, #0x1\n"
"sub x15, x16, #0x2\n"
- "lsl x21, %x[height], #0x1\n" // height * 2
+ "lsl x22, %x[height], #0x1\n" // height * 2
"lsl x20, x16, #0x1\n"
"mov x14, #0x0\n"
"mov x11, %x[in]\n"
@@ -57,15 +55,15 @@ void interleave_block<2, 2, VLType::SME, false>(
"cntw x27, ALL, MUL #3\n"
"ldr x26, [x10, #0x0]\n"
"lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
- "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"ldr x24, [x11, #0x8]\n"
"lsr x17, x17, #0x1\n"
"ptrue p13.s\n"
- "ldr x23, [x10, #0x8]\n"
- "whilelt p12.h, XZR, x21\n"
- "whilelt p11.h, x20, x21\n"
- "mov x22, %x[row_offset]\n"
- "mov x21, %x[out]\n"
+ "ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
"whilelt p10.h, x14, %x[width]\n"
"whilelt p9.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
@@ -76,39 +74,39 @@ void interleave_block<2, 2, VLType::SME, false>(
"1:" // K loop: Charge: Loop
".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"ldr x26, [x10, #0x0]\n"
- ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"add x12, x12, #0x4\n"
"cmp x12, x15, LSL #1\n"
- "ldr x23, [x10, #0x8]\n"
+ "ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
"blt 1b\n"
"2:" // K loop: Charge: End
".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
- ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
- ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
- "inch x22\n"
+ "inch x23\n"
"inch x14\n"
"ldr x24, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- "ldr x23, [x10, #0x8]\n"
+ "ldr x21, [x10, #0x8]\n"
"add x10, x10, #0x10\n"
"cbz x13, 8f\n"
"mov x20, x13\n"
@@ -121,60 +119,60 @@ void interleave_block<2, 2, VLType::SME, false>(
"4:" // K loop: Main loop: First: Loop
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x15\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
"ldr x9, [x11, #0x0]\n"
".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
- ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.h, x14, %x[width]\n"
"inch x14\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
- "addvl x21, x21, #4\n"
- "inch x22\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x23\n"
"whilelt p9.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
"mov x13, #0x0\n"
@@ -183,61 +181,61 @@ void interleave_block<2, 2, VLType::SME, false>(
"6:" // K loop: Main loop: Second: Loop
".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
"ldr x9, [x11, #0x0]\n"
- ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x10, x10, #0x10\n"
"add x13, x13, #0x4\n"
- ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x15\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
- ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
"mov x11, %x[in]\n"
"add x10, %x[in], x16, LSL #3\n"
"ldr x9, [x11, #0x0]\n"
".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
- ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
"ldr x26, [x10, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
"ldr x24, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x10, #0x8]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.h, x14, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
- "addvl x21, x21, #4\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
"inch x14\n"
- "inch x22\n"
+ "inch x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
"cbnz x25, 11f\n"
@@ -248,51 +246,51 @@ void interleave_block<2, 2, VLType::SME, false>(
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
- "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
"add x12, x12, #0x1\n"
".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
- "ldr x26, [x11, x16, LSL #0x3]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
"cmp x12, x16\n"
- ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
- ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+ ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+ ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
"add x11, x11, #0x8\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"add x13, x13, #0x2\n"
"blt 9b\n"
"whilelt p10.h, x14, %x[width]\n"
- "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"whilelt p8.h, x14, %x[width]\n"
"mov x20, #0x0\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"add x20, x20, #0x2\n"
"blt 10b\n"
- "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"blt 12b\n"
"13:" // K loop: End
- "mov %x[out], x21\n"
+ "mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -300,4 +298,4 @@ void interleave_block<2, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
index 4c701cff19..905c6b41eb 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 4, VLType::SME, false>(
int8_t * &out, const int8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntb x21\n"
@@ -248,13 +246,13 @@ void interleave_block<2, 4, VLType::SME, false>(
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
- "ldr x9, [x11, #0x0]\n"
+ "ldr x20, [x11, #0x0]\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
- "ldr x26, [x11, x16, LSL #0x3]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
"add x12, x12, #0x1\n"
".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
"cmp x12, x16\n"
"add x11, x11, #0x8\n"
"addvl x21, x21, #2\n"
@@ -274,7 +272,7 @@ void interleave_block<2, 4, VLType::SME, false>(
"addvl x21, x21, #2\n"
"add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -296,4 +294,4 @@ void interleave_block<2, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
index 25262d3db9..c5c5af20e2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 4, VLType::SME, true>(
@@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
- ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
- "sdot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z19.s, z16.b, z20.b\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 5b\n"
@@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "sdot z19.s, z16.b, z20.b\n"
- ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"incb x15\n"
"add x26, x26, #0x10\n"
- "sdot z19.s, z16.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"incb x28\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
- ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
- "sdot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z19.s, z16.b, z20.b\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 7b\n"
@@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "sdot z19.s, z16.b, z20.b\n"
- ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"subs x20, x20, #0x1\n"
"add x26, x26, #0x10\n"
- "sdot z19.s, z16.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"incb x15\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x21, [x26, #0x0]\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
- "ldr x23, [x26, x16, LSL #0x3]\n"
- ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
"add x12, x12, #0x1\n"
".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
"cmp x12, x16\n"
- "sdot z19.s, z16.b, z20.b\n"
- "sdot z18.s, z17.b, z20.b\n"
- ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
"add x26, x26, #0x8\n"
"addvl x27, x27, #2\n"
"add x13, x13, #0x4\n"
@@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "sdot z19.s, z16.b, z20.b\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"addvl x27, x27, #2\n"
"add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "sdot z19.s, z16.b, z20.b\n"
- "sdot z18.s, z17.b, z20.b\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
"addvl x27, x27, #2\n"
"blt 13b\n"
"14:" // K loop: End
@@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
index 683a315a96..ce9a0065c7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 4, VLType::SME, false>(
uint8_t * &out, const uint8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntb x21\n"
@@ -248,13 +246,13 @@ void interleave_block<2, 4, VLType::SME, false>(
".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
- "ldr x9, [x11, #0x0]\n"
+ "ldr x20, [x11, #0x0]\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
- "ldr x26, [x11, x16, LSL #0x3]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
"add x12, x12, #0x1\n"
".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
- ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
"cmp x12, x16\n"
"add x11, x11, #0x8\n"
"addvl x21, x21, #2\n"
@@ -274,7 +272,7 @@ void interleave_block<2, 4, VLType::SME, false>(
"addvl x21, x21, #2\n"
"add x20, x20, #0x4\n"
"blt 10b\n"
- "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -296,4 +294,4 @@ void interleave_block<2, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
index e7571f7da7..7805152656 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 4, VLType::SME, true>(
@@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "udot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 5b\n"
@@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "udot z19.s, z17.b, z20.b\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"incb x15\n"
"add x26, x26, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"incb x28\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
- "udot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 7b\n"
@@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "udot z19.s, z17.b, z20.b\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"subs x20, x20, #0x1\n"
"add x26, x26, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"incb x15\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x21, [x26, #0x0]\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "ldr x23, [x26, x16, LSL #0x3]\n"
- ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
"add x12, x12, #0x1\n"
".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
"cmp x12, x16\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
- ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
"add x26, x26, #0x8\n"
"addvl x27, x27, #2\n"
"add x13, x13, #0x4\n"
@@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #2\n"
"add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #2\n"
"blt 13b\n"
"14:" // K loop: End
@@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
index 522f310cc0..96ab55ee06 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 1, VLType::SME, false>(
__fp16 * &out, const __fp16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cnth x28\n"
@@ -97,4 +95,4 @@ void interleave_block<2, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
index 949e003598..ac4b1b5086 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 1, VLType::SME, false>(
float * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"mov x22, %x[width]\n"
@@ -55,12 +53,12 @@ void interleave_block<2, 1, VLType::SME, false>(
"ldr x25, [x11, #0x8]\n"
"and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
"csel x15, x15, x16, NE\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x9, #0x8]\n"
"ptrue p13.s\n"
"whilelt p12.s, XZR, %x[height]\n"
"whilelt p11.s, x16, %x[height]\n"
- "mov x22, %x[row_offset]\n"
- "mov x21, %x[out]\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
"whilelt p10.s, x13, %x[width]\n"
"whilelt p9.s, x13, %x[width]\n"
"whilelt p8.s, x13, %x[width]\n"
@@ -71,39 +69,39 @@ void interleave_block<2, 1, VLType::SME, false>(
"1:" // K loop: Charge: Loop
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
"ldr x10, [x11, #0x0]\n"
- ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
"ldr x27, [x9, #0x0]\n"
- ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
"ldr x25, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+ ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x14\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x9, #0x8]\n"
"add x9, x9, #0x10\n"
"blt 1b\n"
"2:" // K loop: Charge: End
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
- ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
"mov x11, %x[in]\n"
"add x9, %x[in], x16, LSL #3\n"
- ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
"ldr x10, [x11, #0x0]\n"
- ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+ ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
"ldr x27, [x9, #0x0]\n"
- "incw x22\n"
+ "incw x23\n"
"incw x13\n"
"ldr x25, [x11, #0x8]\n"
"add x11, x11, #0x10\n"
- "ldr x23, [x9, #0x8]\n"
+ "ldr x21, [x9, #0x8]\n"
"add x9, x9, #0x10\n"
"cbz x20, 8f\n"
"mov x20, x20\n"
@@ -115,59 +113,59 @@ void interleave_block<2, 1, VLType::SME, false>(
"4:" // K loop: Main loop: First: Loop
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
"ldr x10, [x11, #0x0]\n"
- ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
"ldr x27, [x9, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0960aed // ld1w { za3h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
- "ldr x23, [x9, #0x8]\n"
- ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0970aad // ld1w { za3h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x9, x9, #0x10\n"
- ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x14\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 4b\n"
"5:" // K loop: Main loop: First: Tail
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
- ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
"mov x11, %x[in]\n"
"add x9, %x[in], x16, LSL #3\n"
"ldr x10, [x11, #0x0]\n"
".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
"ldr x27, [x9, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe09606ed // ld1w { za3h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ ".inst 0xe09706ad // ld1w { za3h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x9, #0x8]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.s, x13, %x[width]\n"
"incw x13\n"
- ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x9, x9, #0x10\n"
- ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
- "addvl x21, x21, #4\n"
- "incw x22\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "incw x23\n"
"whilelt p9.s, x13, %x[width]\n"
"whilelt p8.s, x13, %x[width]\n"
"mov x12, #0x0\n"
@@ -175,60 +173,60 @@ void interleave_block<2, 1, VLType::SME, false>(
"6:" // K loop: Main loop: Second: Loop
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
"ldr x10, [x11, #0x0]\n"
- ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
"ldr x27, [x9, #0x0]\n"
".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0960ae5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
- "ldr x23, [x9, #0x8]\n"
- ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0970aa5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
"add x11, x11, #0x10\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x9, x9, #0x10\n"
- ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
"add x12, x12, #0x2\n"
"cmp x12, x14\n"
- "addvl x21, x21, #4\n"
+ "addvl x22, x22, #4\n"
"blt 6b\n"
"7:" // K loop: Main loop: Second: Tail
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
- ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
"mov x11, %x[in]\n"
"add x9, %x[in], x16, LSL #3\n"
"ldr x10, [x11, #0x0]\n"
".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
- ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
"ldr x27, [x9, #0x0]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe09606e5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ ".inst 0xe09706a5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
"ldr x25, [x11, #0x8]\n"
".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
- "ldr x23, [x9, #0x8]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
- ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
"whilelt p10.s, x13, %x[width]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
"add x11, x11, #0x10\n"
"add x9, x9, #0x10\n"
- ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
- "addvl x21, x21, #4\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "addvl x22, x22, #4\n"
"incw x13\n"
- "incw x22\n"
+ "incw x23\n"
"bgt 3b\n"
"8:" // K loop: Tails
"cbnz x24, 11f\n"
@@ -238,48 +236,48 @@ void interleave_block<2, 1, VLType::SME, false>(
"mov x12, #0x0\n"
"9:" // K loop: Tails: Even: First
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
- "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
- "ldr x27, [x11, x16, LSL #0x3]\n"
- ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0xe09706a8 // ld1w { za2h.s[x12] }, p1/Z, [x21, x23, LSL #2]\n"
"add x11, x11, #0x8\n"
- "addvl x21, x21, #2\n"
- ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xe097028c // ld1w { za3h.s[x12] }, p0/Z, [x20, x23, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x16\n"
"blt 9b\n"
"whilelt p10.s, x13, %x[width]\n"
- "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"whilelt p8.s, x13, %x[width]\n"
"mov x12, #0x0\n"
"10:" // K loop: Tails: Even: Second
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x15\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"blt 10b\n"
- "whilelt p10.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
"b 13f\n"
"11:" // K loop: Tails: Odd
"mov x12, #0x0\n"
"12:" // K loop: Tails: Odd: Loop
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
- ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
"add x12, x12, #0x1\n"
"cmp x12, x15\n"
- "addvl x21, x21, #2\n"
+ "addvl x22, x22, #2\n"
"blt 12b\n"
"13:" // K loop: End
- "mov %x[out], x21\n"
+ "mov %x[out], x22\n"
".inst 0xd503467f // SMSTOP\n"
: [out] "+&r" (out)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -287,4 +285,4 @@ void interleave_block<2, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
index 4cc84d344a..2e53475b5c 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 2, VLType::SME, false>(
bfloat16 * &out, const bfloat16 * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x16\n"
@@ -124,4 +122,4 @@ void interleave_block<4, 2, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
index 465939c30d..67dd5a9bb7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 4, VLType::SME, false>(
int8_t * &out, const int8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x16\n"
@@ -123,4 +121,4 @@ void interleave_block<4, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
index ffd9384a13..21d9378368 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 4, VLType::SME, true>(
@@ -112,22 +112,22 @@ void interleave_block<4, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828812 // mova z18.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n"
".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n"
+ ".inst 0xc0828893 // mova z19.s, p2/M, za1v.s[x12]\n"
".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
- "sdot z23.s, z18.b, z24.b\n"
+ "sdot z23.s, z17.b, z24.b\n"
".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
- ".inst 0xc0828993 // mova z19.s, p2/M, za3v.s[x12]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x20\n"
- "sdot z22.s, z17.b, z24.b\n"
+ "sdot z22.s, z19.b, z24.b\n"
"sdot z21.s, z16.b, z24.b\n"
"addvl x9, x9, #4\n"
- "sdot z20.s, z19.b, z24.b\n"
+ "sdot z20.s, z18.b, z24.b\n"
"blt 5b\n"
"incb x28\n"
"whilelt p9.b, x28, %x[width]\n"
@@ -147,4 +147,4 @@ void interleave_block<4, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
index 9f5db6ba3d..f149c93293 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 4, VLType::SME, false>(
uint8_t * &out, const uint8_t * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x16\n"
@@ -123,4 +121,4 @@ void interleave_block<4, 4, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
index 49d2acf1cd..252152e3da 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 4, VLType::SME, true>(
@@ -112,22 +112,22 @@ void interleave_block<4, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0828813 // mova z19.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n"
".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
- ".inst 0xc0828912 // mova z18.s, p2/M, za2v.s[x12]\n"
- "udot z23.s, z19.b, z24.b\n"
+ ".inst 0xc0828913 // mova z19.s, p2/M, za2v.s[x12]\n"
+ "udot z23.s, z16.b, z24.b\n"
".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
- ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x20\n"
"udot z22.s, z17.b, z24.b\n"
- "udot z21.s, z18.b, z24.b\n"
+ "udot z21.s, z19.b, z24.b\n"
"addvl x9, x9, #4\n"
- "udot z20.s, z16.b, z24.b\n"
+ "udot z20.s, z18.b, z24.b\n"
"blt 5b\n"
"incb x28\n"
"whilelt p9.b, x28, %x[width]\n"
@@ -147,4 +147,4 @@ void interleave_block<4, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
index 9579263204..b11bb93c42 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<4, 1, VLType::SME, false>(
float * &out, const float * const *in,
- size_t width, size_t height, size_t row_offset, bool first
+ size_t width, size_t height, size_t row_offset, bool
)
{
- ARM_COMPUTE_UNUSED(first);
-
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"cntw x15\n"
@@ -123,4 +121,4 @@ void interleave_block<4, 1, VLType::SME, false>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
index 4f25da2877..b921fd16d2 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,12 @@
*/
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int int_by = height_vectors;
+#endif
std::vector<int32_t> the_sums;
@@ -104,8 +108,12 @@ void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
if (row_sum_multiplier) {
@@ -138,8 +146,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int
unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
const unsigned int k0, const unsigned int kmax, bool integrate_sums,
const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
// pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
@@ -208,8 +220,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
// Use alloca here as a std::vector can be expensive in highly threaded scenarios.
@@ -246,8 +262,12 @@ void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const con
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// Use alloca here as a std::vector can be expensive in highly threaded scenarios.
const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
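(Note on the interleave_indirect_impl.hpp hunks above: they make the file buildable without SVE support by guarding the vector-length query — with ARM_COMPUTE_ENABLE_SVE defined, the interleave height scales with the runtime SVE/SME vector length; otherwise it collapses to the fixed height_vectors used by the NEON kernels. A standalone sketch of the pattern, with hypothetical stand-ins for the library's get_vector_length helpers:)

#include <cstddef>

enum class VLType { None, SVE, SME };

// Hypothetical stand-ins for get_vector_length<T>() and
// sme::get_vector_length<T>() (here: a fixed 128-bit vector).
template <typename T> unsigned int sve_vector_length() { return 16 / sizeof(T); }
template <typename T> unsigned int sme_vector_length() { return 16 / sizeof(T); }

template <unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
unsigned int interleave_height()
{
#ifdef ARM_COMPUTE_ENABLE_SVE
    // Scalable targets: the height depends on the runtime vector length.
    return height_vectors * (vlt == VLType::SVE ? sve_vector_length<TOut>() / block :
                            (vlt == VLType::SME ? sme_vector_length<TOut>() / block : 1));
#else
    // Fixed-length builds never instantiate SVE/SME interleaves,
    // so the height is simply the template parameter.
    return height_vectors;
#endif
}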
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
index 9a871d4b88..72e414969e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
index 74791f8d30..377daddae9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -231,11 +231,11 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"17:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -251,41 +251,41 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x12, #0x10]\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
+ "ldr q19, [x11, #0x10]\n"
+ ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"add x12, x12, #0x40\n"
"ldr q7, [x12, #0x0]\n"
@@ -295,39 +295,39 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"add x9, x9, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x0]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x10]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x0]\n"
+ ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
+ "ldr q24, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q19, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e53ec29 // bfmmla v9.4s, v1.8h, v19.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"add x26, x26, #0x10\n"
"add x12, x12, #0x40\n"
"add x11, x11, #0x40\n"
@@ -338,26 +338,26 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 24f\n"
"23:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr q6, [x12, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x12, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -373,23 +373,23 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr h1, [x26, #0x0]\n"
"26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "ldr q6, [x12, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q20, [x12, #0x0]\n"
+ "ldr q18, [x12, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x0]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
"ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e46ee6e // bfmmla v14.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -405,17 +405,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 28f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"28:" // Height 1: No activation
"cmp x14, #0x10\n"
"bge 37f\n"
@@ -624,12 +624,12 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"55:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -637,7 +637,7 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 57f\n"
"56:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"57:" // Height 2: input setup done
"cmp x27, #0x8\n"
"blt 60f\n"
@@ -648,45 +648,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x12, #0x10]\n"
"blt 59f\n"
"58:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
"add x12, x12, #0x40\n"
"ldr q7, [x12, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x12, #0x10]\n"
"add x11, x11, #0x40\n"
@@ -694,39 +694,39 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"add x9, x9, #0x40\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x12, x12, #0x40\n"
@@ -738,27 +738,27 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 62f\n"
"61:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q6, [x11, #0x0]\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
+ ".inst 0x6e46ee6d // bfmmla v13.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ "ldr q17, [x9, #0x10]\n"
"cmp x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -777,23 +777,23 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr h1, [x26, #0x0]\n"
"ldr h2, [x25, #0x0]\n"
"64:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "ldr q6, [x12, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q27, [x10, #0x10]\n"
+ ".inst 0x6e43ee6a // bfmmla v10.4s, v19.8h, v3.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e5bee6e // bfmmla v14.4s, v19.8h, v27.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -815,25 +815,25 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp2 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 66f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"66:" // Height 2: No activation
"cmp x14, #0x10\n"
"bge 75f\n"
@@ -1107,13 +1107,13 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"93:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 95f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1122,8 +1122,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 95f\n"
"94:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"95:" // Height 3: input setup done
"cmp x27, #0x8\n"
"blt 98f\n"
@@ -1135,170 +1135,170 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x12, #0x10]\n"
"blt 97f\n"
"96:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"cmp x27, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
"ldr q7, [x12, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x12, #0x10]\n"
"bge 96b\n"
"97:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"98:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 103f\n"
"cmp x27, #0x4\n"
"blt 100f\n"
"99:" // Height 3: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr q6, [x12, #0x0]\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x12, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"sub x27, x27, #0x4\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"cmp x27, #0x4\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"bge 99b\n"
"100:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 103f\n"
@@ -1316,36 +1316,36 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr h2, [x25, #0x0]\n"
"ldr h3, [x24, #0x0]\n"
"102:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "ldr q6, [x12, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
+ ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
+ "ldr q25, [x11, #0x10]\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"103:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1368,33 +1368,33 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 104f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"104:" // Height 3: No activation
"cmp x14, #0x10\n"
"bge 113f\n"
@@ -1709,14 +1709,14 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"131:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 132f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 133f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1726,9 +1726,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 133f\n"
"132:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"133:" // Height 4: input setup done
"cmp x27, #0x8\n"
"blt 136f\n"
@@ -1741,174 +1741,174 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x12, #0x10]\n"
"blt 135f\n"
"134:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"cmp x27, #0x10\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"add x23, x23, #0x10\n"
"ldr q4, [x23, #0x0]\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
"ldr q7, [x12, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x12, #0x10]\n"
"bge 134b\n"
"135:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"136:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 141f\n"
"cmp x27, #0x4\n"
"blt 138f\n"
"137:" // Height 4: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"bge 137b\n"
"138:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 141f\n"
@@ -1929,36 +1929,36 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr h3, [x24, #0x0]\n"
"ldr h4, [x23, #0x0]\n"
"140:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "ldr q6, [x12, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"141:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1986,41 +1986,41 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 142f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"142:" // Height 4: No activation
"cmp x14, #0x10\n"
"bge 151f\n"
@@ -2400,15 +2400,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"169:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 170f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 171f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2419,10 +2419,10 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 171f\n"
"170:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"171:" // Height 5: input setup done
"cmp x27, #0x8\n"
"blt 174f\n"
@@ -2435,170 +2435,170 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr q7, [x12, #0x0]\n"
"blt 173f\n"
"172:" // Height 5: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x12, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x12, #0x10]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
"cmp x27, #0x10\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
"add x26, x26, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
"add x22, x22, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x12, #0x30]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
+ "ldr q6, [x11, #0x20]\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x11, #0x30]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
"ldr q7, [x12, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
"ldr q5, [x22, #0x0]\n"
"bge 172b\n"
"173:" // Height 5: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x12, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x12, #0x10]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
"add x25, x25, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x0]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x12, #0x20]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q2, [x12, #0x30]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
"add x12, x12, #0x40\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q0, [x11, #0x20]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x11, #0x30]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x20]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
"ldr q6, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -2608,51 +2608,51 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"blt 176f\n"
"175:" // Height 5: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d5, [x22], #0x8\n"
- "ldr q6, [x12, #0x0]\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x0]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
"cmp x27, #0x4\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x0]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
"bge 175b\n"
"176:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 179f\n"
@@ -2676,45 +2676,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr h4, [x23, #0x0]\n"
"ldr h5, [x22, #0x0]\n"
"178:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x12, #0x10]\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x12, #0x10]\n"
+ ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
+ "ldr q0, [x11, #0x0]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x0]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x0]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
"179:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3213,16 +3213,16 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"207:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 208f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 209f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -3234,11 +3234,11 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"b 209f\n"
"208:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"209:" // Height 6: input setup done
"cmp x27, #0x8\n"
"blt 212f\n"
@@ -3299,45 +3299,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q2, [x25, #0x0]\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
+ "ldr q0, [x12, #0x30]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
+ "ldr q6, [x11, #0x20]\n"
"add x12, x12, #0x40\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x11, #0x30]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
- "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
"ldr q7, [x12, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
"bge 210b\n"
@@ -3387,38 +3387,38 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x12, #0x30]\n"
+ "ldr q2, [x12, #0x30]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
"add x12, x12, #0x40\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x11, #0x30]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q0, [x11, #0x20]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x11, #0x30]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
"add x11, x11, #0x40\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
"add x10, x10, #0x40\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x20]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
"ldr q6, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -3428,52 +3428,52 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"blt 214f\n"
"213:" // Height 6: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x4\n"
- "ldr d5, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x12, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x0]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x0]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
"bge 213b\n"
"214:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 217f\n"
@@ -3500,45 +3500,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 (
"ldr h5, [x22, #0x0]\n"
"ldr h6, [x21, #0x0]\n"
"216:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x12, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x12, #0x10]\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
- "ldr q7, [x11, #0x0]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q0, [x12, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x12, #0x10]\n"
+ ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x0]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x0]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
- "ldr q7, [x9, #0x0]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x0]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x9, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
"217:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
index f7506e5123..4924b3a549 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
index 18a2db5069..8038612200 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
@@ -265,11 +265,11 @@ void a64_ffhybrid_fp16_mla_6x32 (
"24:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 25f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 26f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -286,69 +286,69 @@ void a64_ffhybrid_fp16_mla_6x32 (
"blt 28f\n"
"27:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"add x26, x26, #0x10\n"
"ldr q0, [x26, #0x0]\n"
"add x12, x12, #0x80\n"
@@ -360,68 +360,68 @@ void a64_ffhybrid_fp16_mla_6x32 (
"bge 27b\n"
"28:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
"sub x27, x27, #0x8\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"add x26, x26, #0x10\n"
"add x12, x12, #0x80\n"
"add x11, x11, #0x80\n"
@@ -431,15 +431,15 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cbz x27, 31f\n"
"30:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x26], #0x2\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q7, [x11, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
@@ -452,17 +452,17 @@ void a64_ffhybrid_fp16_mla_6x32 (
"bne 24b\n"
"tbz %x[flags], #1, 32f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
"32:" // Height 1: No activation
"cmp x14, #0x20\n"
"bge 49f\n"
@@ -778,12 +778,12 @@ void a64_ffhybrid_fp16_mla_6x32 (
"74:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 75f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 76f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -791,7 +791,7 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 76f\n"
"75:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"76:" // Height 2: input setup done
"cmp x27, #0x8\n"
"blt 79f\n"
@@ -804,233 +804,233 @@ void a64_ffhybrid_fp16_mla_6x32 (
"77:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"sub x27, x27, #0x8\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x0]\n"
"cmp x27, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
"add x26, x26, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
"add x25, x25, #0x10\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
"add x12, x12, #0x80\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 77b\n"
"78:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"sub x27, x27, #0x8\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x0]\n"
"add x26, x26, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
"add x25, x25, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
"add x12, x12, #0x80\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"79:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 81f\n"
"80:" // Height 2: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x11, #0x0]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
"add x10, x10, #0x10\n"
"add x9, x9, #0x10\n"
"cbnz x27, 80b\n"
@@ -1043,25 +1043,25 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 82f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmin v12.8h, v12.8h, v17.8h\n"
+ "fmin v13.8h, v13.8h, v17.8h\n"
+ "fmin v14.8h, v14.8h, v17.8h\n"
+ "fmin v15.8h, v15.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
"82:" // Height 2: No activation
"cmp x14, #0x20\n"
"bge 99f\n"
@@ -1458,13 +1458,13 @@ void a64_ffhybrid_fp16_mla_6x32 (
"124:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 125f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 126f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1473,8 +1473,8 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 126f\n"
"125:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"126:" // Height 3: input setup done
"cmp x27, #0x8\n"
"blt 129f\n"
@@ -1491,139 +1491,139 @@ void a64_ffhybrid_fp16_mla_6x32 (
"sub x27, x27, #0x8\n"
"cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q21, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x26, x26, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x25, x25, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
"add x24, x24, #0x10\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x12, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x11, #0x40]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x9, #0x40]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x12, #0x50]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x11, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x50]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x12, #0x60]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x11, #0x60]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x9, #0x60]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x12, #0x70]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
"add x12, x12, #0x80\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x70]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
"add x10, x10, #0x80\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 127b\n"
@@ -1633,162 +1633,162 @@ void a64_ffhybrid_fp16_mla_6x32 (
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q21, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x25, x25, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x24, x24, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x12, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x11, #0x40]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x9, #0x40]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x12, #0x50]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x11, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x50]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x12, #0x60]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x11, #0x60]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x9, #0x60]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x12, #0x70]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
"add x12, x12, #0x80\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x70]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
"add x10, x10, #0x80\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"129:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 131f\n"
"130:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
"ldr h1, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr q21, [x12, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x11, #0x0]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x12, x12, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
"add x9, x9, #0x10\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
"cbnz x27, 130b\n"
"131:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1800,33 +1800,33 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 132f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v21.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v21.8h\n"
+ "fmin v9.8h, v9.8h, v21.8h\n"
+ "fmin v10.8h, v10.8h, v21.8h\n"
+ "fmin v11.8h, v11.8h, v21.8h\n"
+ "fmin v12.8h, v12.8h, v21.8h\n"
+ "fmin v13.8h, v13.8h, v21.8h\n"
+ "fmin v14.8h, v14.8h, v21.8h\n"
+ "fmin v15.8h, v15.8h, v21.8h\n"
+ "fmin v16.8h, v16.8h, v21.8h\n"
+ "fmin v17.8h, v17.8h, v21.8h\n"
+ "fmin v18.8h, v18.8h, v21.8h\n"
+ "fmin v19.8h, v19.8h, v21.8h\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
"132:" // Height 3: No activation
"cmp x14, #0x20\n"
"bge 149f\n"
@@ -2304,14 +2304,14 @@ void a64_ffhybrid_fp16_mla_6x32 (
"174:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 175f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 176f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2321,9 +2321,9 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 176f\n"
"175:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"176:" // Height 4: input setup done
"cmp x27, #0x8\n"
"blt 179f\n"
@@ -2342,7 +2342,7 @@ void a64_ffhybrid_fp16_mla_6x32 (
"cmp x27, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x26, x26, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2350,164 +2350,164 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x24, x24, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x0]\n"
"add x23, x23, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x12, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x11, #0x40]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x9, #0x40]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x12, #0x50]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x11, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x50]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x9, #0x50]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x12, #0x60]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x11, #0x60]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x9, #0x60]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x12, #0x70]\n"
"add x12, x12, #0x80\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 177b\n"
@@ -2518,7 +2518,7 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x25, x25, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2526,191 +2526,191 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x23, x23, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x12, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x11, #0x40]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x9, #0x40]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x12, #0x50]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x11, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x50]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x9, #0x50]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x12, #0x60]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x11, #0x60]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x9, #0x60]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x12, #0x70]\n"
"add x12, x12, #0x80\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"179:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 181f\n"
"180:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr q25, [x12, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x12, x12, #0x10\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
"cbnz x27, 180b\n"
"181:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2723,41 +2723,41 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 182f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmin v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v1.8h\n"
- "fmin v22.8h, v22.8h, v1.8h\n"
- "fmin v23.8h, v23.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v25.8h\n"
+ "fmin v9.8h, v9.8h, v25.8h\n"
+ "fmin v10.8h, v10.8h, v25.8h\n"
+ "fmin v11.8h, v11.8h, v25.8h\n"
+ "fmin v12.8h, v12.8h, v25.8h\n"
+ "fmin v13.8h, v13.8h, v25.8h\n"
+ "fmin v14.8h, v14.8h, v25.8h\n"
+ "fmin v15.8h, v15.8h, v25.8h\n"
+ "fmin v16.8h, v16.8h, v25.8h\n"
+ "fmin v17.8h, v17.8h, v25.8h\n"
+ "fmin v18.8h, v18.8h, v25.8h\n"
+ "fmin v19.8h, v19.8h, v25.8h\n"
+ "fmin v20.8h, v20.8h, v25.8h\n"
+ "fmin v21.8h, v21.8h, v25.8h\n"
+ "fmin v22.8h, v22.8h, v25.8h\n"
+ "fmin v23.8h, v23.8h, v25.8h\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
"182:" // Height 4: No activation
"cmp x14, #0x20\n"
"bge 199f\n"
@@ -3316,15 +3316,15 @@ void a64_ffhybrid_fp16_mla_6x32 (
"224:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 225f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 226f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -3335,10 +3335,10 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 226f\n"
"225:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"226:" // Height 5: input setup done
"cmp x27, #0x8\n"
"blt 229f\n"
@@ -3361,7 +3361,7 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x24, x24, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3370,194 +3370,194 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x22, x22, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x12, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x11, #0x40]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x9, #0x40]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x12, #0x50]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x11, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x50]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x9, #0x50]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x12, #0x60]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x11, #0x60]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x9, #0x60]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x12, #0x70]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
"add x12, x12, #0x80\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x70]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
"add x10, x10, #0x80\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
"ldr q3, [x23, #0x0]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 227b\n"
@@ -3571,7 +3571,7 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x0]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x23, x23, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3579,226 +3579,226 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x22, x22, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x9, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x12, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x11, #0x40]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x9, #0x40]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x12, #0x50]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x11, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x50]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x9, #0x50]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x12, #0x60]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x11, #0x60]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x9, #0x60]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x12, #0x70]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x12, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x11, #0x40]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x9, #0x40]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x12, #0x50]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x11, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x50]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x9, #0x50]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x12, #0x60]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x11, #0x60]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x9, #0x60]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x12, #0x70]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
"add x12, x12, #0x80\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x11, #0x70]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x11, #0x70]\n"
"add x11, x11, #0x80\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x70]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x70]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
"add x10, x10, #0x80\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x9, #0x70]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"229:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 231f\n"
"230:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h4, [x26], #0x2\n"
+ "ldr h3, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
"ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h0, [x22], #0x2\n"
+ "ldr q29, [x12, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x11, #0x0]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
"add x12, x12, #0x10\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
"add x11, x11, #0x10\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
"add x10, x10, #0x10\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
"cbnz x27, 230b\n"
"231:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3812,49 +3812,49 @@ void a64_ffhybrid_fp16_mla_6x32 (
"add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 232f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v29.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmin v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v1.8h\n"
- "fmin v22.8h, v22.8h, v1.8h\n"
- "fmin v23.8h, v23.8h, v1.8h\n"
- "fmin v24.8h, v24.8h, v1.8h\n"
- "fmin v25.8h, v25.8h, v1.8h\n"
- "fmin v26.8h, v26.8h, v1.8h\n"
- "fmin v27.8h, v27.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v0.8h\n"
- "fmax v25.8h, v25.8h, v0.8h\n"
- "fmax v26.8h, v26.8h, v0.8h\n"
- "fmax v27.8h, v27.8h, v0.8h\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v29.8h\n"
+ "fmin v9.8h, v9.8h, v29.8h\n"
+ "fmin v10.8h, v10.8h, v29.8h\n"
+ "fmin v11.8h, v11.8h, v29.8h\n"
+ "fmin v12.8h, v12.8h, v29.8h\n"
+ "fmin v13.8h, v13.8h, v29.8h\n"
+ "fmin v14.8h, v14.8h, v29.8h\n"
+ "fmin v15.8h, v15.8h, v29.8h\n"
+ "fmin v16.8h, v16.8h, v29.8h\n"
+ "fmin v17.8h, v17.8h, v29.8h\n"
+ "fmin v18.8h, v18.8h, v29.8h\n"
+ "fmin v19.8h, v19.8h, v29.8h\n"
+ "fmin v20.8h, v20.8h, v29.8h\n"
+ "fmin v21.8h, v21.8h, v29.8h\n"
+ "fmin v22.8h, v22.8h, v29.8h\n"
+ "fmin v23.8h, v23.8h, v29.8h\n"
+ "fmin v24.8h, v24.8h, v29.8h\n"
+ "fmin v25.8h, v25.8h, v29.8h\n"
+ "fmin v26.8h, v26.8h, v29.8h\n"
+ "fmin v27.8h, v27.8h, v29.8h\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
"232:" // Height 5: No activation
"cmp x14, #0x20\n"
"bge 249f\n"
@@ -4497,16 +4497,16 @@ void a64_ffhybrid_fp16_mla_6x32 (
"274:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 275f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 276f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -4518,11 +4518,11 @@ void a64_ffhybrid_fp16_mla_6x32 (
"b 276f\n"
"275:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"276:" // Height 6: input setup done
"cmp x27, #0x8\n"
"blt 279f\n"
@@ -5017,45 +5017,45 @@ void a64_ffhybrid_fp16_mla_6x32 (
"279:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 281f\n"
"280:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h7, [x26], #0x2\n"
+ "ldr h6, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x21], #0x2\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr h5, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x11, #0x0]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x0]\n"
"add x10, x10, #0x10\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v30.8h, v6.8h, v5.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
"cbnz x27, 280b\n"
"281:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
index 08f5aeb2d8..94fb84e409 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
index e0fbe17bad..b1cd6dc970 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
@@ -209,11 +209,11 @@ void a64_ffhybrid_fp32_mla_6x16 (
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -230,37 +230,37 @@ void a64_ffhybrid_fp32_mla_6x16 (
"blt 20f\n"
"19:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x8\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"add x26, x26, #0x10\n"
"ldr q0, [x26, #0x0]\n"
"add x12, x12, #0x40\n"
@@ -272,36 +272,36 @@ void a64_ffhybrid_fp32_mla_6x16 (
"bge 19b\n"
"20:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
"sub x27, x27, #0x4\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"add x26, x26, #0x10\n"
"add x12, x12, #0x40\n"
"add x11, x11, #0x40\n"
@@ -310,16 +310,16 @@ void a64_ffhybrid_fp32_mla_6x16 (
"21:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 23f\n"
"22:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q7, [x11, #0x0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.4s, v16.4s, v18.s[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
@@ -332,17 +332,17 @@ void a64_ffhybrid_fp32_mla_6x16 (
"bne 16b\n"
"tbz %x[flags], #1, 24f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"24:" // Height 1: No activation
"cmp x14, #0x10\n"
"bge 33f\n"
@@ -538,12 +538,12 @@ void a64_ffhybrid_fp32_mla_6x16 (
"50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 51f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -551,7 +551,7 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 52f\n"
"51:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"52:" // Height 2: input setup done
"cmp x27, #0x4\n"
"blt 55f\n"
@@ -564,137 +564,137 @@ void a64_ffhybrid_fp32_mla_6x16 (
"53:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"sub x27, x27, #0x4\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x0]\n"
"cmp x27, #0x8\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
"add x26, x26, #0x10\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
"add x25, x25, #0x10\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
"add x12, x12, #0x40\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 53b\n"
"54:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x0]\n"
"sub x27, x27, #0x4\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x0]\n"
"add x26, x26, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
"add x25, x25, #0x10\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
"add x12, x12, #0x40\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"55:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 57f\n"
"56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x11, #0x0]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
"add x10, x10, #0x10\n"
"add x9, x9, #0x10\n"
"cbnz x27, 56b\n"
@@ -707,25 +707,25 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 58f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"58:" // Height 2: No activation
"cmp x14, #0x10\n"
"bge 67f\n"
@@ -970,13 +970,13 @@ void a64_ffhybrid_fp32_mla_6x16 (
"84:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 85f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 86f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -985,8 +985,8 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 86f\n"
"85:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"86:" // Height 3: input setup done
"cmp x27, #0x4\n"
"blt 89f\n"
@@ -1003,75 +1003,75 @@ void a64_ffhybrid_fp32_mla_6x16 (
"sub x27, x27, #0x4\n"
"cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q21, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x26, x26, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x25, x25, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
"add x24, x24, #0x10\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
"add x12, x12, #0x40\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
"add x10, x10, #0x40\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 87b\n"
@@ -1081,98 +1081,98 @@ void a64_ffhybrid_fp32_mla_6x16 (
"sub x27, x27, #0x4\n"
"add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q21, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x25, x25, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x24, x24, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
"add x12, x12, #0x40\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
"add x10, x10, #0x40\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"89:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 91f\n"
"90:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x12, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x11, #0x0]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x9, #0x0]\n"
"add x12, x12, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
"add x9, x9, #0x10\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
"cbnz x27, 90b\n"
"91:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1184,33 +1184,33 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 92f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"92:" // Height 3: No activation
"cmp x14, #0x10\n"
"bge 101f\n"
@@ -1504,14 +1504,14 @@ void a64_ffhybrid_fp32_mla_6x16 (
"118:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 119f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 120f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1521,9 +1521,9 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 120f\n"
"119:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"120:" // Height 4: input setup done
"cmp x27, #0x4\n"
"blt 123f\n"
@@ -1542,7 +1542,7 @@ void a64_ffhybrid_fp32_mla_6x16 (
"cmp x27, #0x8\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x26, x26, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1550,84 +1550,84 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x24, x24, #0x10\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x0]\n"
"add x23, x23, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x12, #0x30]\n"
"add x12, x12, #0x40\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 121b\n"
@@ -1638,7 +1638,7 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x25, x25, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1646,111 +1646,111 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x23, x23, #0x10\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x12, #0x30]\n"
"add x12, x12, #0x40\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"123:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 125f\n"
"124:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x12, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x10, #0x0]\n"
"add x12, x12, #0x10\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
"add x11, x11, #0x10\n"
"add x10, x10, #0x10\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
"cbnz x27, 124b\n"
"125:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1763,41 +1763,41 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 126f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"126:" // Height 4: No activation
"cmp x14, #0x10\n"
"bge 135f\n"
@@ -2140,15 +2140,15 @@ void a64_ffhybrid_fp32_mla_6x16 (
"152:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 153f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 154f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2159,10 +2159,10 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 154f\n"
"153:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"154:" // Height 5: input setup done
"cmp x27, #0x4\n"
"blt 157f\n"
@@ -2185,7 +2185,7 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x24, x24, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2194,98 +2194,98 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x22, x22, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
"add x12, x12, #0x40\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
"add x10, x10, #0x40\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr q6, [x12, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
"ldr q3, [x23, #0x0]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x11, #0x0]\n"
"bge 155b\n"
@@ -2299,7 +2299,7 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x0]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x23, x23, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2307,130 +2307,130 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x22, x22, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x12, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x9, #0x10]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x12, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x11, #0x20]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x9, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x12, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
"add x12, x12, #0x40\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x11, #0x30]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x11, #0x30]\n"
"add x11, x11, #0x40\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
"add x10, x10, #0x40\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x9, #0x30]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"157:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 159f\n"
"158:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x12, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x11, #0x0]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
"add x11, x11, #0x10\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
"add x10, x10, #0x10\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
"cbnz x27, 158b\n"
"159:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2444,49 +2444,49 @@ void a64_ffhybrid_fp32_mla_6x16 (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 160f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmin v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v0.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"160:" // Height 5: No activation
"cmp x14, #0x10\n"
"bge 169f\n"
@@ -2881,16 +2881,16 @@ void a64_ffhybrid_fp32_mla_6x16 (
"186:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 187f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 188f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2902,11 +2902,11 @@ void a64_ffhybrid_fp32_mla_6x16 (
"b 188f\n"
"187:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"188:" // Height 6: input setup done
"cmp x27, #0x4\n"
"blt 191f\n"
@@ -3177,45 +3177,45 @@ void a64_ffhybrid_fp32_mla_6x16 (
"191:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 193f\n"
"192:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x12, #0x0]\n"
- "ldr q7, [x11, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x11, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
"add x12, x12, #0x10\n"
"add x11, x11, #0x10\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x10, #0x0]\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x10, #0x0]\n"
"add x10, x10, #0x10\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x9, #0x0]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v30.4s, v6.4s, v5.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
"cbnz x27, 192b\n"
"193:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
index af2c1e5ae0..923d008bb1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index 1f707fa962..8961e615d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -283,11 +283,11 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"21:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -307,32 +307,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"24:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q23, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x8\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
"add x12, x12, #0x20\n"
"ldr q4, [x12, #0x0]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x12, #0x10]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
"ldr q7, [x11, #0x10]\n"
"add x10, x10, #0x20\n"
@@ -343,28 +343,28 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"25:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q21, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x6e56ec0a // bfmmla v10.4s, v0.8h, v22.8h\n"
+ "ldr q23, [x28, #0x0]\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
+ "ldr q22, [x28, #0x10]\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x0]\n"
+ ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
+ "ldr q3, [x27, #0x10]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
"add x10, x10, #0x20\n"
"add x9, x9, #0x20\n"
"add x28, x28, #0x20\n"
@@ -380,31 +380,31 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"27:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr s0, [x24, #0x0]\n"
"28:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q4, [x12, #0x0]\n"
- "ldr q5, [x12, #0x10]\n"
+ "ldr q21, [x12, #0x0]\n"
+ "ldr q30, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q6, [x11, #0x0]\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q4, [x10, #0x0]\n"
- "ldr q5, [x10, #0x10]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- "ldr q6, [x27, #0x0]\n"
- "ldr q7, [x27, #0x10]\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x11, #0x0]\n"
+ "ldr q22, [x11, #0x10]\n"
+ ".inst 0x6e5eec0e // bfmmla v14.4s, v0.8h, v30.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x10]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ "ldr q21, [x27, #0x10]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -424,21 +424,21 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 30f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
"30:" // Height 1: No activation
"cmp x14, #0x18\n"
"bge 43f\n"
@@ -744,12 +744,12 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"65:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 66f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 67f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -757,7 +757,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"b 67f\n"
"66:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"67:" // Height 2: input setup done
"cmp x25, #0x4\n"
"blt 70f\n"
@@ -774,32 +774,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
"ld1 { v1.4s }, [x23], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q30, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q23, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e5eec0a // bfmmla v10.4s, v0.8h, v30.8h\n"
+ "ldr q2, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x8\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e42ec0c // bfmmla v12.4s, v0.8h, v2.8h\n"
"ldr q4, [x12, #0x0]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x12, #0x10]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
"add x10, x10, #0x20\n"
"ldr q7, [x11, #0x10]\n"
@@ -811,28 +811,28 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q23, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x10, x10, #0x20\n"
"add x9, x9, #0x20\n"
"add x28, x28, #0x20\n"
@@ -851,32 +851,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr s0, [x24, #0x0]\n"
"ldr s1, [x23, #0x0]\n"
"72:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q4, [x12, #0x0]\n"
- "ldr q5, [x12, #0x10]\n"
+ "ldr q24, [x12, #0x0]\n"
+ "ldr q23, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ldr q6, [x11, #0x0]\n"
- "ldr q7, [x11, #0x10]\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q4, [x10, #0x0]\n"
- "ldr q5, [x10, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q6, [x27, #0x0]\n"
- "ldr q7, [x27, #0x10]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ldr q22, [x11, #0x0]\n"
+ "ldr q21, [x11, #0x10]\n"
+ ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x9, #0x0]\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ "ldr q21, [x27, #0x10]\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x12, x12, #0x20\n"
"add x11, x11, #0x20\n"
"add x10, x10, #0x20\n"
@@ -904,33 +904,33 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"uzp2 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 74f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v4.4s, v4.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v22.4s\n"
+ "fmin v14.4s, v14.4s, v22.4s\n"
+ "fmin v15.4s, v15.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v22.4s\n"
+ "fmin v17.4s, v17.4s, v22.4s\n"
+ "fmin v18.4s, v18.4s, v22.4s\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v4.4s, v4.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
"74:" // Height 2: No activation
"cmp x14, #0x18\n"
"bge 87f\n"
@@ -1339,13 +1339,13 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"109:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 110f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 111f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1354,8 +1354,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"b 111f\n"
"110:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"111:" // Height 3: input setup done
"cmp x25, #0x4\n"
"blt 114f\n"
@@ -1386,7 +1386,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"cmp x25, #0x8\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
+ "ldr q3, [x9, #0x10]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
"add x12, x12, #0x20\n"
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
@@ -1399,10 +1399,10 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"add x10, x10, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x10]\n"
"add x28, x28, #0x20\n"
".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
"add x27, x27, #0x20\n"
@@ -1414,9 +1414,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
"ldr q6, [x11, #0x0]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n"
"ld1 { v2.4s }, [x22], #0x10\n"
"ldr q7, [x11, #0x10]\n"
"bge 112b\n"
@@ -1427,10 +1427,10 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"sub x25, x25, #0x4\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"add x12, x12, #0x20\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
@@ -1438,31 +1438,31 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"add x11, x11, #0x20\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
"add x28, x28, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
"add x27, x27, #0x20\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"114:" // Height 3: Multiply loop: Main loop skip
"cbz x25, 117f\n"
"cbz x25, 117f\n"
@@ -1480,51 +1480,51 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr s1, [x23, #0x0]\n"
"ldr s2, [x22, #0x0]\n"
"116:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q4, [x12, #0x0]\n"
- "ldr q5, [x12, #0x10]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q4, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ldr q6, [x11, #0x0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q3, [x11, #0x0]\n"
+ "ldr q1, [x11, #0x10]\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x0]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x10]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
"add x28, x28, #0x20\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
"add x27, x27, #0x20\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"117:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -2070,14 +2070,14 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"153:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 154f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 155f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -2087,9 +2087,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"b 155f\n"
"154:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"155:" // Height 4: input setup done
"cmp x25, #0x4\n"
"blt 158f\n"
@@ -2167,40 +2167,40 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"add x11, x11, #0x20\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"add x10, x10, #0x20\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x9, #0x0]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
"add x28, x28, #0x20\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
"add x27, x27, #0x20\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"158:" // Height 4: Multiply loop: Main loop skip
"cbz x25, 161f\n"
"cbz x25, 161f\n"
@@ -2221,52 +2221,52 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
"ldr s2, [x22, #0x0]\n"
"ldr s3, [x21, #0x0]\n"
"160:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q4, [x12, #0x0]\n"
- "ldr q5, [x12, #0x10]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q4, [x12, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "ldr q6, [x11, #0x0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "ldr q6, [x11, #0x10]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x10, #0x0]\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x0]\n"
"add x12, x12, #0x20\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x10, #0x10]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x10]\n"
"add x11, x11, #0x20\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x9, #0x0]\n"
"add x10, x10, #0x20\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ "ldr q1, [x9, #0x10]\n"
"add x9, x9, #0x20\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
"add x28, x28, #0x20\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x27, #0x0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x27, #0x10]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
"add x27, x27, #0x20\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"161:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
index e24dab68e8..745f89eff6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
index 2458d6a035..5f4fcac690 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
@@ -52,29 +52,29 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x8\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x4\n"
- "mov x21, x25\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
"ldr q0, [%x[Apanel], #0x0]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.16b, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
"ldr q5, [x22, #0x0]\n"
"movi v9.16b, #0x0\n"
"ldr q6, [x21, #0x0]\n"
@@ -104,8 +104,8 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
@@ -117,11 +117,11 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [x25, #0x10]\n"
+ "ldr q4, [x23, #0x10]\n"
".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
- "add x25, x25, #0x20\n"
+ "add x23, x23, #0x20\n"
".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
@@ -138,35 +138,35 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x21, #0x10]\n"
+ "ldr q2, [x21, #0x10]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
"add x21, x21, #0x20\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- ".inst 0x4f62f08b // bfdot v11.4s, v4.8h, v2.h[1]\n"
- ".inst 0x4f42f88e // bfdot v14.4s, v4.8h, v2.h[2]\n"
- ".inst 0x4f62f891 // bfdot v17.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f097 // bfdot v23.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f89a // bfdot v26.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f89d // bfdot v29.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [x25, #0x0]\n"
- ".inst 0x4f42f0a9 // bfdot v9.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ac // bfdot v12.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8af // bfdot v15.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8b2 // bfdot v18.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b5 // bfdot v21.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b8 // bfdot v24.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8bb // bfdot v27.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8be // bfdot v30.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n"
+ ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n"
+ ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n"
+ ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n"
+ ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n"
+ ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n"
+ ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n"
+ ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n"
+ ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n"
+ ".inst 0x4f43f8af // bfdot v15.4s, v5.8h, v3.h[2]\n"
+ ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n"
+ ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n"
+ ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n"
+ ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n"
"ldr q5, [x22, #0x0]\n"
- ".inst 0x4f42f0ca // bfdot v10.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0cd // bfdot v13.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0d9 // bfdot v25.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8dc // bfdot v28.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n"
+ ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n"
+ ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n"
+ ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n"
"ldr q6, [x21, #0x0]\n"
"bge 4b\n"
"5:" // main loop skip
@@ -175,7 +175,7 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
"add %x[Apanel], %x[Apanel], #0x20\n"
".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
- "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
"add x22, x22, #0x10\n"
@@ -199,38 +199,38 @@ void a64_ffinterleaved_bf16fp32_dot_8x12(
".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
"cbz x20, 6f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x25, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n"
- "ldr q5, [x21, #0x0]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f40f8ee // bfdot v14.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f60f8f1 // bfdot v17.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f41f0f4 // bfdot v20.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f61f0f7 // bfdot v23.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f41f8fa // bfdot v26.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f61f8fd // bfdot v29.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f40f089 // bfdot v9.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f08c // bfdot v12.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f88f // bfdot v15.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f892 // bfdot v18.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f095 // bfdot v21.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f098 // bfdot v24.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89b // bfdot v27.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89e // bfdot v30.4s, v4.8h, v1.h[3]\n"
- ".inst 0x4f40f0aa // bfdot v10.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8b0 // bfdot v16.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8b3 // bfdot v19.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b6 // bfdot v22.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8bc // bfdot v28.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bf // bfdot v31.4s, v5.8h, v1.h[3]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
+ ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
+ ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
+ ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n"
+ ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n"
+ ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n"
+ ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n"
+ ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n"
+ ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n"
+ ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n"
+ ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n"
+ ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n"
+ ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n"
+ ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n"
+ ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n"
"6:" // multiply loop done
- "subs x24, x24, #0xc\n"
+ "subs x25, x25, #0xc\n"
"str q8, [%x[Cpanel], #0x0]\n"
"str q9, [%x[Cpanel], #0x10]\n"
"str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
index c61315b80a..cf4d74266a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
index 47991114af..4a1c1b5638 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -52,37 +52,37 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x8\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x4\n"
- "mov x21, x25\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
- "ldr q4, [x25, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
"ldr q0, [%x[Apanel], #0x0]\n"
"movi v8.16b, #0x0\n"
"ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q5, [x25, #0x10]\n"
+ "ldr q5, [x23, #0x10]\n"
"movi v9.16b, #0x0\n"
"ldr q2, [%x[Apanel], #0x20]\n"
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "add x25, x25, #0x20\n"
+ "add x23, x23, #0x20\n"
"movi v12.16b, #0x0\n"
"movi v13.16b, #0x0\n"
"add %x[Apanel], %x[Apanel], #0x30\n"
@@ -106,31 +106,31 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q3, [%x[Apanel], #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [x22, #0x10]\n"
+ "ldr q3, [x22, #0x10]\n"
".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"sub x20, x20, #0x2\n"
".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
"ldr q4, [x21, #0x0]\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
"ldr q5, [x21, #0x10]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
"cmp x20, #0x2\n"
- ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
- "ldr q6, [x25, #0x0]\n"
- ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
- "ldr q7, [x25, #0x10]\n"
+ ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x23, #0x0]\n"
+ ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x23, #0x10]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
"ldr q0, [%x[Apanel], #0x10]\n"
@@ -140,22 +140,22 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
"ldr q2, [%x[Apanel], #0x30]\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
"ldr q4, [x22, #0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
"ldr q5, [x22, #0x30]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
"add x22, x22, #0x40\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [x21, #0x20]\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [x21, #0x30]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x21, #0x20]\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x21, #0x30]\n"
".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
@@ -163,23 +163,23 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
"add x21, x21, #0x40\n"
".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
- "ldr q4, [x25, #0x20]\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
- "ldr q5, [x25, #0x30]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x23, #0x20]\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
+ "ldr q5, [x23, #0x30]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
"ldr q0, [%x[Apanel], #0x50]\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
"ldr q1, [%x[Apanel], #0x60]\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
"ldr q2, [%x[Apanel], #0x70]\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
- "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
"bge 4b\n"
"5:" // main loop skip
"ldr q3, [%x[Apanel], #0x0]\n"
@@ -215,88 +215,88 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12(
".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
"cbz x20, 6f\n"
- "ldr q6, [x25, #0x0]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q7, [x25, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x22, #0x10]\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x23, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q2, [x22, #0x10]\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [x21, #0x0]\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [x21, #0x10]\n"
- ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
- ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x21, #0x0]\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x21, #0x10]\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"6:" // multiply loop done
- "subs x24, x24, #0xc\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "subs x25, x25, #0xc\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
index 1495306879..b9b4ad54df 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
index 36bfccf52f..1e3f2f300b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
@@ -51,27 +51,27 @@ void a64_ffinterleaved_fp16_mla_8x24(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x10\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x10\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x8\n"
- "mov x21, x25\n"
+ "cmp x25, #0x8\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
"ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
"movi v8.16b, #0x0\n"
"ldr q3, [x22, #0x0]\n"
"ldr q4, [x21, #0x0]\n"
@@ -102,11 +102,11 @@ void a64_ffinterleaved_fp16_mla_8x24(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q5, [x25, #0x10]\n"
+ "ldr q7, [%x[Apanel], #0x10]\n"
+ "ldr q6, [x23, #0x10]\n"
"fmla v8.8h, v2.8h, v0.h[0]\n"
- "ldr q6, [x22, #0x10]\n"
- "ldr q7, [x21, #0x10]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "ldr q1, [x21, #0x10]\n"
"fmla v11.8h, v2.8h, v0.h[1]\n"
"fmla v14.8h, v2.8h, v0.h[2]\n"
"fmla v17.8h, v2.8h, v0.h[3]\n"
@@ -119,8 +119,8 @@ void a64_ffinterleaved_fp16_mla_8x24(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v9.8h, v3.8h, v0.h[0]\n"
"fmla v12.8h, v3.8h, v0.h[1]\n"
- "add x25, x25, #0x20\n"
- "ldr q2, [x25, #0x0]\n"
+ "add x23, x23, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v15.8h, v3.8h, v0.h[2]\n"
"fmla v18.8h, v3.8h, v0.h[3]\n"
"fmla v21.8h, v3.8h, v0.h[4]\n"
@@ -140,30 +140,30 @@ void a64_ffinterleaved_fp16_mla_8x24(
"fmla v31.8h, v4.8h, v0.h[7]\n"
"ldr q0, [%x[Apanel], #0x0]\n"
"ldr q4, [x21, #0x0]\n"
- "fmla v8.8h, v5.8h, v1.h[0]\n"
- "fmla v11.8h, v5.8h, v1.h[1]\n"
- "fmla v14.8h, v5.8h, v1.h[2]\n"
- "fmla v17.8h, v5.8h, v1.h[3]\n"
- "fmla v20.8h, v5.8h, v1.h[4]\n"
- "fmla v23.8h, v5.8h, v1.h[5]\n"
- "fmla v26.8h, v5.8h, v1.h[6]\n"
- "fmla v29.8h, v5.8h, v1.h[7]\n"
- "fmla v9.8h, v6.8h, v1.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v15.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v1.h[3]\n"
- "fmla v21.8h, v6.8h, v1.h[4]\n"
- "fmla v24.8h, v6.8h, v1.h[5]\n"
- "fmla v27.8h, v6.8h, v1.h[6]\n"
- "fmla v30.8h, v6.8h, v1.h[7]\n"
- "fmla v10.8h, v7.8h, v1.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v16.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v1.h[3]\n"
- "fmla v22.8h, v7.8h, v1.h[4]\n"
- "fmla v25.8h, v7.8h, v1.h[5]\n"
- "fmla v28.8h, v7.8h, v1.h[6]\n"
- "fmla v31.8h, v7.8h, v1.h[7]\n"
+ "fmla v8.8h, v6.8h, v7.h[0]\n"
+ "fmla v11.8h, v6.8h, v7.h[1]\n"
+ "fmla v14.8h, v6.8h, v7.h[2]\n"
+ "fmla v17.8h, v6.8h, v7.h[3]\n"
+ "fmla v20.8h, v6.8h, v7.h[4]\n"
+ "fmla v23.8h, v6.8h, v7.h[5]\n"
+ "fmla v26.8h, v6.8h, v7.h[6]\n"
+ "fmla v29.8h, v6.8h, v7.h[7]\n"
+ "fmla v9.8h, v5.8h, v7.h[0]\n"
+ "fmla v12.8h, v5.8h, v7.h[1]\n"
+ "fmla v15.8h, v5.8h, v7.h[2]\n"
+ "fmla v18.8h, v5.8h, v7.h[3]\n"
+ "fmla v21.8h, v5.8h, v7.h[4]\n"
+ "fmla v24.8h, v5.8h, v7.h[5]\n"
+ "fmla v27.8h, v5.8h, v7.h[6]\n"
+ "fmla v30.8h, v5.8h, v7.h[7]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v13.8h, v1.8h, v7.h[1]\n"
+ "fmla v16.8h, v1.8h, v7.h[2]\n"
+ "fmla v19.8h, v1.8h, v7.h[3]\n"
+ "fmla v22.8h, v1.8h, v7.h[4]\n"
+ "fmla v25.8h, v1.8h, v7.h[5]\n"
+ "fmla v28.8h, v1.8h, v7.h[6]\n"
+ "fmla v31.8h, v1.8h, v7.h[7]\n"
"bge 4b\n"
"5:" // main loop skip
"fmla v8.8h, v2.8h, v0.h[0]\n"
@@ -171,7 +171,7 @@ void a64_ffinterleaved_fp16_mla_8x24(
"add %x[Apanel], %x[Apanel], #0x10\n"
"fmla v14.8h, v2.8h, v0.h[2]\n"
"fmla v17.8h, v2.8h, v0.h[3]\n"
- "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v2.8h, v0.h[4]\n"
"fmla v23.8h, v2.8h, v0.h[5]\n"
"add x22, x22, #0x10\n"
@@ -195,37 +195,37 @@ void a64_ffinterleaved_fp16_mla_8x24(
"fmla v28.8h, v4.8h, v0.h[6]\n"
"fmla v31.8h, v4.8h, v0.h[7]\n"
"cbz x20, 6f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q5, [x25, #0x0]\n"
- "fmla v8.8h, v5.8h, v0.h[0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q7, [x21, #0x0]\n"
- "fmla v11.8h, v5.8h, v0.h[1]\n"
- "fmla v14.8h, v5.8h, v0.h[2]\n"
- "fmla v17.8h, v5.8h, v0.h[3]\n"
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "fmla v8.8h, v2.8h, v3.h[0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.8h, v2.8h, v3.h[1]\n"
+ "fmla v14.8h, v2.8h, v3.h[2]\n"
+ "fmla v17.8h, v2.8h, v3.h[3]\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v20.8h, v5.8h, v0.h[4]\n"
- "fmla v23.8h, v5.8h, v0.h[5]\n"
- "fmla v26.8h, v5.8h, v0.h[6]\n"
- "fmla v29.8h, v5.8h, v0.h[7]\n"
- "fmla v9.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v0.h[1]\n"
- "fmla v15.8h, v6.8h, v0.h[2]\n"
- "fmla v18.8h, v6.8h, v0.h[3]\n"
- "fmla v21.8h, v6.8h, v0.h[4]\n"
- "fmla v24.8h, v6.8h, v0.h[5]\n"
- "fmla v27.8h, v6.8h, v0.h[6]\n"
- "fmla v30.8h, v6.8h, v0.h[7]\n"
- "fmla v10.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v0.h[1]\n"
- "fmla v16.8h, v7.8h, v0.h[2]\n"
- "fmla v19.8h, v7.8h, v0.h[3]\n"
- "fmla v22.8h, v7.8h, v0.h[4]\n"
- "fmla v25.8h, v7.8h, v0.h[5]\n"
- "fmla v28.8h, v7.8h, v0.h[6]\n"
- "fmla v31.8h, v7.8h, v0.h[7]\n"
+ "fmla v20.8h, v2.8h, v3.h[4]\n"
+ "fmla v23.8h, v2.8h, v3.h[5]\n"
+ "fmla v26.8h, v2.8h, v3.h[6]\n"
+ "fmla v29.8h, v2.8h, v3.h[7]\n"
+ "fmla v9.8h, v1.8h, v3.h[0]\n"
+ "fmla v12.8h, v1.8h, v3.h[1]\n"
+ "fmla v15.8h, v1.8h, v3.h[2]\n"
+ "fmla v18.8h, v1.8h, v3.h[3]\n"
+ "fmla v21.8h, v1.8h, v3.h[4]\n"
+ "fmla v24.8h, v1.8h, v3.h[5]\n"
+ "fmla v27.8h, v1.8h, v3.h[6]\n"
+ "fmla v30.8h, v1.8h, v3.h[7]\n"
+ "fmla v10.8h, v0.8h, v3.h[0]\n"
+ "fmla v13.8h, v0.8h, v3.h[1]\n"
+ "fmla v16.8h, v0.8h, v3.h[2]\n"
+ "fmla v19.8h, v0.8h, v3.h[3]\n"
+ "fmla v22.8h, v0.8h, v3.h[4]\n"
+ "fmla v25.8h, v0.8h, v3.h[5]\n"
+ "fmla v28.8h, v0.8h, v3.h[6]\n"
+ "fmla v31.8h, v0.8h, v3.h[7]\n"
"6:" // multiply loop done
- "subs x24, x24, #0x18\n"
+ "subs x25, x25, #0x18\n"
"str q8, [%x[Cpanel], #0x0]\n"
"str q9, [%x[Cpanel], #0x10]\n"
"str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
index f2a836c9b4..c4445ba14a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
index ec99d64f4a..6de0a380eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
@@ -51,29 +51,29 @@ void a64_ffinterleaved_fp32_mla_8x12(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x8\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x4\n"
- "mov x21, x25\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
"ldr q0, [%x[Apanel], #0x0]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.16b, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
"ldr q5, [x22, #0x0]\n"
"movi v9.16b, #0x0\n"
"ldr q6, [x21, #0x0]\n"
@@ -103,10 +103,10 @@ void a64_ffinterleaved_fp32_mla_8x12(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [x25, #0x10]\n"
+ "ldr q2, [x23, #0x10]\n"
"fmla v11.4s, v4.4s, v0.s[1]\n"
"fmla v14.4s, v4.4s, v0.s[2]\n"
"fmla v17.4s, v4.4s, v0.s[3]\n"
@@ -136,36 +136,36 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v28.4s, v6.4s, v1.s[2]\n"
"fmla v31.4s, v6.4s, v1.s[3]\n"
"ldr q1, [%x[Apanel], #0x50]\n"
- "ldr q6, [x25, #0x20]\n"
- "fmla v8.4s, v7.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v2.s[1]\n"
- "fmla v14.4s, v7.4s, v2.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v20.4s, v7.4s, v3.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v26.4s, v7.4s, v3.s[2]\n"
- "fmla v29.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x22, #0x20]\n"
- "fmla v9.4s, v4.4s, v2.s[0]\n"
- "fmla v12.4s, v4.4s, v2.s[1]\n"
- "fmla v15.4s, v4.4s, v2.s[2]\n"
- "fmla v18.4s, v4.4s, v2.s[3]\n"
- "fmla v21.4s, v4.4s, v3.s[0]\n"
- "fmla v24.4s, v4.4s, v3.s[1]\n"
- "fmla v27.4s, v4.4s, v3.s[2]\n"
- "fmla v30.4s, v4.4s, v3.s[3]\n"
+ "ldr q6, [x23, #0x20]\n"
+ "fmla v8.4s, v2.4s, v3.s[0]\n"
+ "fmla v11.4s, v2.4s, v3.s[1]\n"
+ "fmla v14.4s, v2.4s, v3.s[2]\n"
+ "fmla v17.4s, v2.4s, v3.s[3]\n"
+ "fmla v20.4s, v2.4s, v7.s[0]\n"
+ "fmla v23.4s, v2.4s, v7.s[1]\n"
+ "fmla v26.4s, v2.4s, v7.s[2]\n"
+ "fmla v29.4s, v2.4s, v7.s[3]\n"
+ "ldr q2, [x22, #0x20]\n"
+ "fmla v9.4s, v4.4s, v3.s[0]\n"
+ "fmla v12.4s, v4.4s, v3.s[1]\n"
+ "fmla v15.4s, v4.4s, v3.s[2]\n"
+ "fmla v18.4s, v4.4s, v3.s[3]\n"
+ "fmla v21.4s, v4.4s, v7.s[0]\n"
+ "fmla v24.4s, v4.4s, v7.s[1]\n"
+ "fmla v27.4s, v4.4s, v7.s[2]\n"
+ "fmla v30.4s, v4.4s, v7.s[3]\n"
"ldr q4, [x21, #0x20]\n"
- "fmla v10.4s, v5.4s, v2.s[0]\n"
- "fmla v13.4s, v5.4s, v2.s[1]\n"
- "fmla v16.4s, v5.4s, v2.s[2]\n"
- "fmla v19.4s, v5.4s, v2.s[3]\n"
- "ldr q2, [%x[Apanel], #0x60]\n"
- "fmla v22.4s, v5.4s, v3.s[0]\n"
- "fmla v25.4s, v5.4s, v3.s[1]\n"
- "fmla v28.4s, v5.4s, v3.s[2]\n"
- "fmla v31.4s, v5.4s, v3.s[3]\n"
- "ldr q3, [%x[Apanel], #0x70]\n"
- "ldr q5, [x25, #0x30]\n"
+ "fmla v10.4s, v5.4s, v3.s[0]\n"
+ "fmla v13.4s, v5.4s, v3.s[1]\n"
+ "fmla v16.4s, v5.4s, v3.s[2]\n"
+ "fmla v19.4s, v5.4s, v3.s[3]\n"
+ "ldr q3, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v5.4s, v7.s[0]\n"
+ "fmla v25.4s, v5.4s, v7.s[1]\n"
+ "fmla v28.4s, v5.4s, v7.s[2]\n"
+ "fmla v31.4s, v5.4s, v7.s[3]\n"
+ "ldr q7, [%x[Apanel], #0x70]\n"
+ "ldr q5, [x23, #0x30]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v11.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v0.s[2]\n"
@@ -173,20 +173,20 @@ void a64_ffinterleaved_fp32_mla_8x12(
"add %x[Apanel], %x[Apanel], #0x80\n"
"fmla v20.4s, v6.4s, v1.s[0]\n"
"fmla v23.4s, v6.4s, v1.s[1]\n"
- "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
"fmla v26.4s, v6.4s, v1.s[2]\n"
"fmla v29.4s, v6.4s, v1.s[3]\n"
"ldr q6, [x22, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v12.4s, v7.4s, v0.s[1]\n"
+ "fmla v9.4s, v2.4s, v0.s[0]\n"
+ "fmla v12.4s, v2.4s, v0.s[1]\n"
"add x22, x22, #0x40\n"
- "fmla v15.4s, v7.4s, v0.s[2]\n"
- "fmla v18.4s, v7.4s, v0.s[3]\n"
- "fmla v21.4s, v7.4s, v1.s[0]\n"
- "fmla v24.4s, v7.4s, v1.s[1]\n"
- "fmla v27.4s, v7.4s, v1.s[2]\n"
- "fmla v30.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x21, #0x30]\n"
+ "fmla v15.4s, v2.4s, v0.s[2]\n"
+ "fmla v18.4s, v2.4s, v0.s[3]\n"
+ "fmla v21.4s, v2.4s, v1.s[0]\n"
+ "fmla v24.4s, v2.4s, v1.s[1]\n"
+ "fmla v27.4s, v2.4s, v1.s[2]\n"
+ "fmla v30.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x21, #0x30]\n"
"fmla v10.4s, v4.4s, v0.s[0]\n"
"fmla v13.4s, v4.4s, v0.s[1]\n"
"add x21, x21, #0x40\n"
@@ -198,33 +198,33 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v28.4s, v4.4s, v1.s[2]\n"
"fmla v31.4s, v4.4s, v1.s[3]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x25, #0x0]\n"
- "fmla v8.4s, v5.4s, v2.s[0]\n"
- "fmla v11.4s, v5.4s, v2.s[1]\n"
- "fmla v14.4s, v5.4s, v2.s[2]\n"
- "fmla v17.4s, v5.4s, v2.s[3]\n"
- "fmla v20.4s, v5.4s, v3.s[0]\n"
- "fmla v23.4s, v5.4s, v3.s[1]\n"
- "fmla v26.4s, v5.4s, v3.s[2]\n"
- "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v8.4s, v5.4s, v3.s[0]\n"
+ "fmla v11.4s, v5.4s, v3.s[1]\n"
+ "fmla v14.4s, v5.4s, v3.s[2]\n"
+ "fmla v17.4s, v5.4s, v3.s[3]\n"
+ "fmla v20.4s, v5.4s, v7.s[0]\n"
+ "fmla v23.4s, v5.4s, v7.s[1]\n"
+ "fmla v26.4s, v5.4s, v7.s[2]\n"
+ "fmla v29.4s, v5.4s, v7.s[3]\n"
"ldr q5, [x22, #0x0]\n"
- "fmla v9.4s, v6.4s, v2.s[0]\n"
- "fmla v12.4s, v6.4s, v2.s[1]\n"
- "fmla v15.4s, v6.4s, v2.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v21.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v3.s[1]\n"
- "fmla v27.4s, v6.4s, v3.s[2]\n"
- "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v9.4s, v6.4s, v3.s[0]\n"
+ "fmla v12.4s, v6.4s, v3.s[1]\n"
+ "fmla v15.4s, v6.4s, v3.s[2]\n"
+ "fmla v18.4s, v6.4s, v3.s[3]\n"
+ "fmla v21.4s, v6.4s, v7.s[0]\n"
+ "fmla v24.4s, v6.4s, v7.s[1]\n"
+ "fmla v27.4s, v6.4s, v7.s[2]\n"
+ "fmla v30.4s, v6.4s, v7.s[3]\n"
"ldr q6, [x21, #0x0]\n"
- "fmla v10.4s, v7.4s, v2.s[0]\n"
- "fmla v13.4s, v7.4s, v2.s[1]\n"
- "fmla v16.4s, v7.4s, v2.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v22.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v3.s[1]\n"
- "fmla v28.4s, v7.4s, v3.s[2]\n"
- "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "fmla v10.4s, v2.4s, v3.s[0]\n"
+ "fmla v13.4s, v2.4s, v3.s[1]\n"
+ "fmla v16.4s, v2.4s, v3.s[2]\n"
+ "fmla v19.4s, v2.4s, v3.s[3]\n"
+ "fmla v22.4s, v2.4s, v7.s[0]\n"
+ "fmla v25.4s, v2.4s, v7.s[1]\n"
+ "fmla v28.4s, v2.4s, v7.s[2]\n"
+ "fmla v31.4s, v2.4s, v7.s[3]\n"
"bge 4b\n"
"5:" // main loop skip
"fmla v8.4s, v4.4s, v0.s[0]\n"
@@ -232,7 +232,7 @@ void a64_ffinterleaved_fp32_mla_8x12(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v14.4s, v4.4s, v0.s[2]\n"
"fmla v17.4s, v4.4s, v0.s[3]\n"
- "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v4.4s, v1.s[0]\n"
"fmla v23.4s, v4.4s, v1.s[1]\n"
"add x22, x22, #0x10\n"
@@ -257,43 +257,43 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v31.4s, v6.4s, v1.s[3]\n"
"cbz x20, 7f\n"
"6:" // odd loop
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
"subs x20, x20, #0x1\n"
- "ldr q7, [x25, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "fmla v8.4s, v7.4s, v0.s[0]\n"
- "ldr q5, [x21, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v14.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v0.s[3]\n"
- "fmla v20.4s, v7.4s, v1.s[0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "fmla v8.4s, v2.4s, v4.s[0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.4s, v2.4s, v4.s[1]\n"
+ "fmla v14.4s, v2.4s, v4.s[2]\n"
+ "fmla v17.4s, v2.4s, v4.s[3]\n"
+ "fmla v20.4s, v2.4s, v3.s[0]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v23.4s, v7.4s, v1.s[1]\n"
- "fmla v26.4s, v7.4s, v1.s[2]\n"
- "add x25, x25, #0x10\n"
- "fmla v29.4s, v7.4s, v1.s[3]\n"
- "fmla v9.4s, v4.4s, v0.s[0]\n"
+ "fmla v23.4s, v2.4s, v3.s[1]\n"
+ "fmla v26.4s, v2.4s, v3.s[2]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v29.4s, v2.4s, v3.s[3]\n"
+ "fmla v9.4s, v1.4s, v4.s[0]\n"
"add x22, x22, #0x10\n"
- "fmla v12.4s, v4.4s, v0.s[1]\n"
- "fmla v15.4s, v4.4s, v0.s[2]\n"
+ "fmla v12.4s, v1.4s, v4.s[1]\n"
+ "fmla v15.4s, v1.4s, v4.s[2]\n"
"add x21, x21, #0x10\n"
- "fmla v18.4s, v4.4s, v0.s[3]\n"
- "fmla v21.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v1.s[1]\n"
- "fmla v27.4s, v4.4s, v1.s[2]\n"
- "fmla v30.4s, v4.4s, v1.s[3]\n"
- "fmla v10.4s, v5.4s, v0.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[1]\n"
- "fmla v16.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v0.s[3]\n"
- "fmla v22.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v1.s[1]\n"
- "fmla v28.4s, v5.4s, v1.s[2]\n"
- "fmla v31.4s, v5.4s, v1.s[3]\n"
+ "fmla v18.4s, v1.4s, v4.s[3]\n"
+ "fmla v21.4s, v1.4s, v3.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[1]\n"
+ "fmla v27.4s, v1.4s, v3.s[2]\n"
+ "fmla v30.4s, v1.4s, v3.s[3]\n"
+ "fmla v10.4s, v0.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v4.s[1]\n"
+ "fmla v16.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v22.4s, v0.4s, v3.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[1]\n"
+ "fmla v28.4s, v0.4s, v3.s[2]\n"
+ "fmla v31.4s, v0.4s, v3.s[3]\n"
"bne 6b\n"
"7:" // multiply loop done
- "subs x24, x24, #0xc\n"
+ "subs x25, x25, #0xc\n"
"str q8, [%x[Cpanel], #0x0]\n"
"str q9, [%x[Cpanel], #0x10]\n"
"str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index 3b8770e153..f1427669ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
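The hybrid bf16fp32_dot kernel in the generic.cpp hunks below gets the same treatment: the B temporaries q6/q7 become q16/q17 (q20/q21 in the three-row variant) so freshly loaded A and B values stay live across the chain. Each BFDOT-by-element accumulates, into every fp32 lane, the dot product of one bf16 pair from the B vector with an indexed bf16 pair broadcast from the A vector. A reference model under the same assumptions as the BFMMLA sketch (documented semantics, internal rounding ignored):

#include <cstdint>
#include <cstring>

static float bf16_to_f32(uint16_t h) {  // same widening helper as before
    uint32_t bits = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

// BFDOT Vd.4S, Vn.8H, Vm.2H[idx]: fp32 lane i of the accumulator gains the
// dot product of the bf16 pair n[2i], n[2i+1] with the broadcast pair
// m[2*idx], m[2*idx+1].
static void bfdot_lane_ref(float d[4], const uint16_t n[8],
                           const uint16_t m[8], int idx) {
    float m0 = bf16_to_f32(m[2 * idx]);
    float m1 = bf16_to_f32(m[2 * idx + 1]);
    for (int i = 0; i < 4; ++i)
        d[i] += bf16_to_f32(n[2 * i]) * m0 + bf16_to_f32(n[2 * i + 1]) * m1;
}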
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index 02d2434356..fc323ea4fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -93,7 +93,6 @@ void a64_hybrid_bf16fp32_dot_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 176f\n"
@@ -190,11 +189,11 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -211,37 +210,37 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x10\n"
"add x10, x10, #0x100\n"
@@ -251,37 +250,37 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x8\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
@@ -289,31 +288,31 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x27, #0x2\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f52f208 // bfdot v8.4s, v16.8h, v18.h[0]\n"
"sub x27, x27, #0x2\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f52f209 // bfdot v9.4s, v16.8h, v18.h[0]\n"
"cmp x27, #0x2\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f52f22a // bfdot v10.4s, v17.8h, v18.h[0]\n"
+ ".inst 0x4f52f20b // bfdot v11.4s, v16.8h, v18.h[0]\n"
"add x10, x10, #0x40\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
"cbz x27, 24f\n"
"ldr h0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -323,17 +322,17 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"25:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 34f\n"
@@ -511,12 +510,12 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 51f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -524,7 +523,7 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"b 52f\n"
"51:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"52:" // Height 2: input setup done
"cmp x27, #0x8\n"
"blt 55f\n"
@@ -537,156 +536,156 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"53:" // Height 2: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"sub x27, x27, #0x8\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"cmp x27, #0x10\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 53b\n"
"54:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"add x26, x26, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n"
"55:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 59f\n"
"cmp x27, #0x2\n"
"blt 57f\n"
"56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
"cmp x27, #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f53f228 // bfdot v8.4s, v17.8h, v19.h[0]\n"
+ ".inst 0x4f52f22c // bfdot v12.4s, v17.8h, v18.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f53f209 // bfdot v9.4s, v16.8h, v19.h[0]\n"
+ ".inst 0x4f52f20d // bfdot v13.4s, v16.8h, v18.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f53f22a // bfdot v10.4s, v17.8h, v19.h[0]\n"
+ ".inst 0x4f52f22e // bfdot v14.4s, v17.8h, v18.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f53f20b // bfdot v11.4s, v16.8h, v19.h[0]\n"
+ ".inst 0x4f52f20f // bfdot v15.4s, v16.8h, v18.h[0]\n"
"bge 56b\n"
"57:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 59f\n"
"ldr h0, [x26, #0x0]\n"
"ldr h1, [x25, #0x0]\n"
"58:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22c // bfdot v12.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20d // bfdot v13.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
"59:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -698,25 +697,25 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 60f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"60:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 69f\n"
@@ -943,13 +942,13 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"85:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 87f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -958,8 +957,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"b 87f\n"
"86:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"87:" // Height 3: input setup done
"cmp x27, #0x8\n"
"blt 90f\n"
@@ -976,75 +975,75 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"add x25, x25, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
"cmp x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 88b\n"
@@ -1054,98 +1053,98 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"add x24, x24, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n"
+ ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n"
"90:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 94f\n"
"cmp x27, #0x2\n"
"blt 92f\n"
"91:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
"cmp x27, #0x2\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x4f58f2a8 // bfdot v8.4s, v21.8h, v24.h[0]\n"
+ ".inst 0x4f57f2ac // bfdot v12.4s, v21.8h, v23.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f56f2b0 // bfdot v16.4s, v21.8h, v22.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f58f289 // bfdot v9.4s, v20.8h, v24.h[0]\n"
+ ".inst 0x4f57f28d // bfdot v13.4s, v20.8h, v23.h[0]\n"
+ ".inst 0x4f56f291 // bfdot v17.4s, v20.8h, v22.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f58f2aa // bfdot v10.4s, v21.8h, v24.h[0]\n"
+ ".inst 0x4f57f2ae // bfdot v14.4s, v21.8h, v23.h[0]\n"
+ ".inst 0x4f56f2b2 // bfdot v18.4s, v21.8h, v22.h[0]\n"
+ ".inst 0x4f58f28b // bfdot v11.4s, v20.8h, v24.h[0]\n"
+ ".inst 0x4f57f28f // bfdot v15.4s, v20.8h, v23.h[0]\n"
+ ".inst 0x4f56f293 // bfdot v19.4s, v20.8h, v22.h[0]\n"
"bge 91b\n"
"92:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 94f\n"
@@ -1153,23 +1152,23 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr h1, [x25, #0x0]\n"
"ldr h2, [x24, #0x0]\n"
"93:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f40f2a8 // bfdot v8.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ac // bfdot v12.4s, v21.8h, v1.h[0]\n"
+ ".inst 0x4f42f2b0 // bfdot v16.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f40f289 // bfdot v9.4s, v20.8h, v0.h[0]\n"
+ ".inst 0x4f41f28d // bfdot v13.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f291 // bfdot v17.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
"94:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1183,33 +1182,33 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 95f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"95:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 104f\n"
@@ -1485,14 +1484,14 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"120:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 122f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1502,9 +1501,9 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"b 122f\n"
"121:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"122:" // Height 4: input setup done
"cmp x27, #0x8\n"
"blt 125f\n"
@@ -1523,7 +1522,7 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x25, x25, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -1531,85 +1530,85 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"cmp x27, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb34 // bfdot v20.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 123b\n"
@@ -1620,7 +1619,7 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x25, x25, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x24, x24, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -1628,112 +1627,112 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"sub x27, x27, #0x8\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb34 // bfdot v20.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n"
+ ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n"
"125:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 129f\n"
"cmp x27, #0x2\n"
"blt 127f\n"
"126:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
"cmp x27, #0x2\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f5df328 // bfdot v8.4s, v25.8h, v29.h[0]\n"
+ ".inst 0x4f5cf32c // bfdot v12.4s, v25.8h, v28.h[0]\n"
+ ".inst 0x4f5bf330 // bfdot v16.4s, v25.8h, v27.h[0]\n"
+ ".inst 0x4f5af334 // bfdot v20.4s, v25.8h, v26.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f5df309 // bfdot v9.4s, v24.8h, v29.h[0]\n"
+ ".inst 0x4f5cf30d // bfdot v13.4s, v24.8h, v28.h[0]\n"
+ ".inst 0x4f5bf311 // bfdot v17.4s, v24.8h, v27.h[0]\n"
+ ".inst 0x4f5af315 // bfdot v21.4s, v24.8h, v26.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f5df32a // bfdot v10.4s, v25.8h, v29.h[0]\n"
+ ".inst 0x4f5cf32e // bfdot v14.4s, v25.8h, v28.h[0]\n"
+ ".inst 0x4f5bf332 // bfdot v18.4s, v25.8h, v27.h[0]\n"
+ ".inst 0x4f5af336 // bfdot v22.4s, v25.8h, v26.h[0]\n"
+ ".inst 0x4f5df30b // bfdot v11.4s, v24.8h, v29.h[0]\n"
+ ".inst 0x4f5cf30f // bfdot v15.4s, v24.8h, v28.h[0]\n"
+ ".inst 0x4f5bf313 // bfdot v19.4s, v24.8h, v27.h[0]\n"
+ ".inst 0x4f5af317 // bfdot v23.4s, v24.8h, v26.h[0]\n"
"bge 126b\n"
"127:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 129f\n"
@@ -1742,27 +1741,27 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr h2, [x24, #0x0]\n"
"ldr h3, [x23, #0x0]\n"
"128:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f40f328 // bfdot v8.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32c // bfdot v12.4s, v25.8h, v1.h[0]\n"
+ ".inst 0x4f42f330 // bfdot v16.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f334 // bfdot v20.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f40f309 // bfdot v9.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30d // bfdot v13.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f311 // bfdot v17.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f315 // bfdot v21.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
"129:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1778,41 +1777,41 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 130f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"130:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 139f\n"
@@ -2137,15 +2136,15 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"155:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 156f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 157f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2156,10 +2155,10 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"b 157f\n"
"156:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"157:" // Height 5: input setup done
"cmp x27, #0x8\n"
"blt 160f\n"
@@ -2182,7 +2181,7 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"add x23, x23, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -2191,100 +2190,100 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x27, #0x10\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb0 // bfdot v16.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n"
"ldr q3, [x23, #0x0]\n"
- ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 158b\n"
@@ -2298,7 +2297,7 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"add x22, x22, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -2307,131 +2306,131 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb0 // bfdot v16.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n"
+ ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n"
"160:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 164f\n"
"cmp x27, #0x2\n"
"blt 162f\n"
"161:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
"cmp x27, #0x2\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x4f42f3a8 // bfdot v8.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f40f3b0 // bfdot v16.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f5ff3b4 // bfdot v20.4s, v29.8h, v31.h[0]\n"
+ ".inst 0x4f5ef3b8 // bfdot v24.4s, v29.8h, v30.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f42f389 // bfdot v9.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f40f391 // bfdot v17.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f5ff395 // bfdot v21.4s, v28.8h, v31.h[0]\n"
+ ".inst 0x4f5ef399 // bfdot v25.4s, v28.8h, v30.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f42f3aa // bfdot v10.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f40f3b2 // bfdot v18.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f5ff3b6 // bfdot v22.4s, v29.8h, v31.h[0]\n"
+ ".inst 0x4f5ef3ba // bfdot v26.4s, v29.8h, v30.h[0]\n"
+ ".inst 0x4f42f38b // bfdot v11.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f40f393 // bfdot v19.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f5ff397 // bfdot v23.4s, v28.8h, v31.h[0]\n"
+ ".inst 0x4f5ef39b // bfdot v27.4s, v28.8h, v30.h[0]\n"
"bge 161b\n"
"162:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 164f\n"
@@ -2441,31 +2440,31 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr h3, [x23, #0x0]\n"
"ldr h4, [x22, #0x0]\n"
"163:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f40f3a8 // bfdot v8.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f42f3b0 // bfdot v16.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b4 // bfdot v20.4s, v29.8h, v3.h[0]\n"
+ ".inst 0x4f44f3b8 // bfdot v24.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f40f389 // bfdot v9.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f391 // bfdot v17.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f395 // bfdot v21.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f399 // bfdot v25.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
"164:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2483,49 +2482,49 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 165f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmin v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v0.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"165:" // Height 5: No activation
"cmp x11, #0x10\n"
"bge 174f\n"
@@ -2902,16 +2901,16 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"190:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 191f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 192f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2923,11 +2922,11 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"b 192f\n"
"191:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"192:" // Height 6: input setup done
"cmp x27, #0x8\n"
"blt 195f\n"
@@ -3206,43 +3205,43 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp x27, #0x2\n"
"blt 197f\n"
"196:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x2\n"
"cmp x27, #0x2\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4f47f028 // bfdot v8.4s, v1.8h, v7.h[0]\n"
+ ".inst 0x4f46f02c // bfdot v12.4s, v1.8h, v6.h[0]\n"
+ ".inst 0x4f45f030 // bfdot v16.4s, v1.8h, v5.h[0]\n"
+ ".inst 0x4f44f034 // bfdot v20.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f43f038 // bfdot v24.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f42f03c // bfdot v28.4s, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4f47f009 // bfdot v9.4s, v0.8h, v7.h[0]\n"
+ ".inst 0x4f46f00d // bfdot v13.4s, v0.8h, v6.h[0]\n"
+ ".inst 0x4f45f011 // bfdot v17.4s, v0.8h, v5.h[0]\n"
+ ".inst 0x4f44f015 // bfdot v21.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f43f019 // bfdot v25.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f42f01d // bfdot v29.4s, v0.8h, v2.h[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f47f02a // bfdot v10.4s, v1.8h, v7.h[0]\n"
+ ".inst 0x4f46f02e // bfdot v14.4s, v1.8h, v6.h[0]\n"
+ ".inst 0x4f45f032 // bfdot v18.4s, v1.8h, v5.h[0]\n"
+ ".inst 0x4f44f036 // bfdot v22.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f43f03a // bfdot v26.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f42f03e // bfdot v30.4s, v1.8h, v2.h[0]\n"
+ ".inst 0x4f47f00b // bfdot v11.4s, v0.8h, v7.h[0]\n"
+ ".inst 0x4f46f00f // bfdot v15.4s, v0.8h, v6.h[0]\n"
+ ".inst 0x4f45f013 // bfdot v19.4s, v0.8h, v5.h[0]\n"
+ ".inst 0x4f44f017 // bfdot v23.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f43f01b // bfdot v27.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f42f01f // bfdot v31.4s, v0.8h, v2.h[0]\n"
"bge 196b\n"
"197:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 199f\n"
@@ -3253,35 +3252,35 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ldr h4, [x22, #0x0]\n"
"ldr h5, [x21, #0x0]\n"
"198:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ec // bfdot v12.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f0 // bfdot v16.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f4 // bfdot v20.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f8 // bfdot v24.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fc // bfdot v28.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4f40f0c9 // bfdot v9.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0cd // bfdot v13.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d1 // bfdot v17.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d5 // bfdot v21.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d9 // bfdot v25.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dd // bfdot v29.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f40f0ea // bfdot v10.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ee // bfdot v14.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f2 // bfdot v18.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f6 // bfdot v22.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fa // bfdot v26.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fe // bfdot v30.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f40f0cb // bfdot v11.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0cf // bfdot v15.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d3 // bfdot v19.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d7 // bfdot v23.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0db // bfdot v27.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0df // bfdot v31.4s, v6.8h, v5.h[0]\n"
"199:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3522,7 +3521,6 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"212:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
index 8cb743b777..d9e7259fa2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
index 5a000c69af..f6389e27d1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -93,7 +93,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 186f\n"
@@ -211,11 +210,11 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -231,41 +230,41 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 20f\n"
"19:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
+ "ldr q19, [x10, #0x30]\n"
+ ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
@@ -273,40 +272,40 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
"bge 19b\n"
"20:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x10, #0x30]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x50]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
+ "ldr q24, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e51ec28 // bfmmla v8.4s, v1.8h, v17.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"21:" // Height 1: Multiply loop: Main loop skip
@@ -314,26 +313,26 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 23f\n"
"22:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Skip odd blocks
@@ -346,23 +345,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr h1, [x26, #0x0]\n"
"25:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q20, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x30]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q2, [x10, #0x50]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e42ee6e // bfmmla v14.4s, v19.8h, v2.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"26:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -376,17 +375,17 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"27:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 36f\n"
@@ -577,12 +576,12 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"53:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 54f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 55f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -590,7 +589,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"b 55f\n"
"54:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"55:" // Height 2: input setup done
"cmp x27, #0x8\n"
"blt 58f\n"
@@ -601,85 +600,85 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 57f\n"
"56:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x10\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"bge 56b\n"
"57:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
"sub x27, x27, #0x8\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
@@ -689,27 +688,27 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"cmp x27, #0x4\n"
"blt 60f\n"
"59:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q6, [x10, #0x60]\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q5, [x10, #0x30]\n"
+ ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
+ ".inst 0x6e45ee6d // bfmmla v13.4s, v19.8h, v5.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
"cmp x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"bge 59b\n"
"60:" // Height 2: Multiply loop: Skip odd blocks
@@ -725,23 +724,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr h1, [x26, #0x0]\n"
"ldr h2, [x25, #0x0]\n"
"62:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q30, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x10, #0x50]\n"
+ ".inst 0x6e5eee6a // bfmmla v10.4s, v19.8h, v30.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e5aee6e // bfmmla v14.4s, v19.8h, v26.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"63:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -762,25 +761,25 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp2 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"64:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 73f\n"
@@ -1036,13 +1035,13 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"90:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 91f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 92f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1051,8 +1050,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"b 92f\n"
"91:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"92:" // Height 3: input setup done
"cmp x27, #0x8\n"
"blt 95f\n"
@@ -1064,167 +1063,167 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 94f\n"
"93:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"cmp x27, #0x10\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 93b\n"
"94:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"95:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 100f\n"
"cmp x27, #0x4\n"
"blt 97f\n"
"96:" // Height 3: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"sub x27, x27, #0x4\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"cmp x27, #0x4\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"bge 96b\n"
"97:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 100f\n"
@@ -1242,33 +1241,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr h2, [x25, #0x0]\n"
"ldr h3, [x24, #0x0]\n"
"99:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
+ ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"100:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1294,33 +1293,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 101f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"101:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 110f\n"
@@ -1617,14 +1616,14 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"127:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 128f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 129f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1634,9 +1633,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"b 129f\n"
"128:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"129:" // Height 4: input setup done
"cmp x27, #0x8\n"
"blt 132f\n"
@@ -1649,173 +1648,173 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 131f\n"
"130:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"add x23, x23, #0x10\n"
"ldr q4, [x23, #0x0]\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
"cmp x27, #0x10\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 130b\n"
"131:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
"add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
"132:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 137f\n"
"cmp x27, #0x4\n"
"blt 134f\n"
"133:" // Height 4: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"bge 133b\n"
"134:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 137f\n"
@@ -1836,33 +1835,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr h3, [x24, #0x0]\n"
"ldr h4, [x23, #0x0]\n"
"136:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
"137:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1894,41 +1893,41 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 138f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v7.4s, v7.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v7.4s, v7.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"138:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 147f\n"
@@ -2290,15 +2289,15 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"164:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 165f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 166f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2309,10 +2308,10 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"b 166f\n"
"165:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"166:" // Height 5: input setup done
"cmp x27, #0x8\n"
"blt 169f\n"
@@ -2325,174 +2324,174 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q7, [x10, #0x0]\n"
"blt 168f\n"
"167:" // Height 5: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
"sub x27, x27, #0x8\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
"add x25, x25, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
"cmp x27, #0x10\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
"ldr q5, [x22, #0x0]\n"
"bge 167b\n"
"168:" // Height 5: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
"add x26, x26, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
"add x24, x24, #0x10\n"
".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x40]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
"add x22, x22, #0x10\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
"ldr q7, [x10, #0x80]\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -2502,48 +2501,48 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"blt 171f\n"
"170:" // Height 5: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d5, [x22], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
"cmp x27, #0x4\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
"bge 170b\n"
"171:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 174f\n"
@@ -2567,42 +2566,42 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr h4, [x23, #0x0]\n"
"ldr h5, [x22, #0x0]\n"
"173:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
"174:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3088,16 +3087,16 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"201:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 202f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 203f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -3109,11 +3108,11 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"b 203f\n"
"202:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"203:" // Height 6: input setup done
"cmp x27, #0x8\n"
"blt 206f\n"
@@ -3180,42 +3179,42 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr q2, [x25, #0x0]\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
"bge 204b\n"
@@ -3271,35 +3270,35 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
- ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
- ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -3309,49 +3308,49 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"blt 208f\n"
"207:" // Height 6: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x4\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x4\n"
- "ldr d5, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
"bge 207b\n"
"208:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 211f\n"
@@ -3378,42 +3377,42 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"ldr h5, [x22, #0x0]\n"
"ldr h6, [x21, #0x0]\n"
"210:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
"ldr q6, [x10, #0x70]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
- ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
"211:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3678,7 +3677,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index 8ce3d1b995..8b80c25beb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -79,12 +79,12 @@ public:
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
return { 6.94 };
+ default:
+ return { 14.53 };
case CPUModel::A510:
return { 8.94 };
case CPUModel::V1:
return { 29.26 };
- default:
- return { 14.53 };
}
}
@@ -108,5 +108,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
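
(Aside before the a55.cpp hunks that follow: that kernel's inner loop is built from lane-broadcast multiply-accumulates such as "fmla v8.8h, v6.8h, v0.h[0]" — one register of eight fp16 weights scaled by a single input element. A hedged sketch of the equivalent ACLE intrinsic form; accumulate_lane is an illustrative name, and the code assumes a toolchain providing __ARM_FEATURE_FP16_VECTOR_ARITHMETIC.)

  #include <arm_neon.h>

  // acc += weights * input[0], across all eight fp16 lanes — the intrinsic
  // counterpart of the assembly's "fmla v8.8h, v6.8h, v0.h[0]".
  float16x8_t accumulate_lane(float16x8_t acc, float16x8_t weights, float16x8_t input) {
      return vfmaq_laneq_f16(acc, weights, input, 0);
  }
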
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
index 19636548a0..b049ed45f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -244,11 +244,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
"cbnz x15, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
@@ -265,222 +265,222 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"blt 27f\n"
"26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr d17, [x17, #0x20]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x38]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x12, [x17, #0x48]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x58]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x78]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x98]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xb8]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xd8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xf8]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "ldr x12, [x17, #0x108]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x118]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "ldr x12, [x17, #0x128]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x138]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "ldr x12, [x17, #0x148]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x158]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "ldr x12, [x17, #0x168]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x178]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "ldr x12, [x17, #0x188]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x198]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "ldr x12, [x17, #0x1a8]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x1b8]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "ldr x12, [x17, #0x1c8]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x1d8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "ldr x12, [x17, #0x1e8]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x1f8]\n"
- "mov v7.d[1], x11\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr d17, [x17, #0x100]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr d16, [x17, #0x110]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr d17, [x17, #0x120]\n"
+ "ldr x20, [x17, #0x128]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr d16, [x17, #0x130]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x138]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr d17, [x17, #0x140]\n"
+ "ldr x20, [x17, #0x148]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr d16, [x17, #0x150]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr d17, [x17, #0x160]\n"
+ "ldr x20, [x17, #0x168]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr d16, [x17, #0x170]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x178]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr d17, [x17, #0x180]\n"
+ "ldr x20, [x17, #0x188]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr d16, [x17, #0x190]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr d17, [x17, #0x1a0]\n"
+ "ldr x20, [x17, #0x1a8]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr d16, [x17, #0x1b0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr d17, [x17, #0x1c0]\n"
+ "ldr x20, [x17, #0x1c8]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr d16, [x17, #0x1d0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr d17, [x17, #0x1e0]\n"
+ "ldr x20, [x17, #0x1e8]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr d16, [x17, #0x1f0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "mov v16.d[1], x20\n"
"add x13, x13, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
"sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x10\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v0.d[1], x10\n"
- "mov v7.d[1], x11\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x13, #0x80]\n"
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x17, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x17, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x17, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x17, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x17, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x17, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x17, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x17, #0x1f0]\n"
"add x13, x13, #0x10\n"
"sub x14, x14, #0x8\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
"28:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x17, x17, #0x40\n"
"cbnz x14, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
@@ -491,17 +491,17 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
"31:" // Height 1: No activation
"cmp x8, #0x20\n"
"bge 48f\n"
@@ -799,324 +799,324 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
"cbnz x15, 74f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
- "add x9, x9, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
"b 74f\n"
"73:" // Height 2: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
"74:" // Height 2: input setup done
"cmp x14, #0x8\n"
"blt 77f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 76f\n"
"75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x48]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x58]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x98]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xd8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr x12, [x17, #0x108]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x118]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr x12, [x17, #0x128]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr x12, [x17, #0x148]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x158]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr x12, [x17, #0x168]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr x12, [x17, #0x188]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x198]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr x12, [x17, #0x1a8]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr x12, [x17, #0x1c8]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x1d8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr x12, [x17, #0x1e8]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "mov v6.d[1], x12\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr d17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr d16, [x17, #0x110]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr d17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr d16, [x17, #0x130]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr d17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr x20, [x17, #0x148]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr d16, [x17, #0x150]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr d17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr d16, [x17, #0x170]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr d17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr x20, [x17, #0x188]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr d16, [x17, #0x190]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr d17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr d16, [x17, #0x1b0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr d17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr x20, [x17, #0x1c8]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr d16, [x17, #0x1d0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr d17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr d16, [x17, #0x1f0]\n"
+ "mov v17.d[1], x21\n"
"add x13, x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x9, #0x0]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
"sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x10\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x28, [x9, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v1.d[1], x28\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v1.d[1], x21\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 75b\n"
"76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
"sub x14, x14, #0x8\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x17, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x17, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x17, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x17, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x17, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x17, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x17, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x17, #0x1f0]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"77:" // Height 2: Multiply loop: Main loop skip
"cbz x14, 79f\n"
"78:" // Height 2: Multiply loop: Odd block loop
- "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr h1, [x9], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h0, [x12], #0x2\n"
+ "ldr q17, [x17, #0x0]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x17, #0x20]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
"cbnz x14, 78b\n"
"79:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1129,25 +1129,25 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
+ "fmin v12.8h, v12.8h, v16.8h\n"
+ "fmin v13.8h, v13.8h, v16.8h\n"
+ "fmin v14.8h, v14.8h, v16.8h\n"
+ "fmin v15.8h, v15.8h, v16.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
"80:" // Height 2: No activation
"cmp x8, #0x20\n"
"bge 97f\n"
@@ -1526,404 +1526,404 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
"cbnz x15, 123f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
- "add x9, x9, x20, LSL #1\n"
- "add x27, x27, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
"b 123f\n"
"122:" // Height 3: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #1\n"
- "add x27, x9, x20, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
"123:" // Height 3: input setup done
"cmp x14, #0x8\n"
"blt 126f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 125f\n"
"124:" // Height 3: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d21, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x108]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x128]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x148]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x168]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x188]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x1a8]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1c8]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1e8]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr d20, [x17, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr d21, [x17, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr d20, [x17, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr d21, [x17, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr d20, [x17, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr d21, [x17, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr d20, [x17, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr d21, [x17, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr d20, [x17, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr d21, [x17, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr d20, [x17, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr d21, [x17, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr d20, [x17, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr d21, [x17, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr d20, [x17, #0x110]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr d21, [x17, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr d20, [x17, #0x130]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr d21, [x17, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr d20, [x17, #0x150]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr d21, [x17, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr d20, [x17, #0x170]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr d21, [x17, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr d20, [x17, #0x190]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr d21, [x17, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr d20, [x17, #0x1b0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr d21, [x17, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr d20, [x17, #0x1d0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr d21, [x17, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
"add x13, x13, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr d20, [x17, #0x1f0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x9, #0x0]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
"sub x14, x14, #0x8\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x10\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v0.d[1], x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v1.d[1], x28\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v2.d[1], x26\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "mov v7.d[1], x11\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 124b\n"
"125:" // Height 3: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q21, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x17, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x17, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x17, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x17, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x17, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x17, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x17, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x17, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x17, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x17, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x17, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x17, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x17, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x17, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x17, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x17, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x17, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x17, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x17, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x17, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x17, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x17, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x17, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x17, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x17, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x17, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x17, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x17, #0x1f0]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"126:" // Height 3: Multiply loop: Main loop skip
"cbz x14, 128f\n"
"127:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x13], #0x2\n"
+ "ldr h2, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h0, [x11], #0x2\n"
+ "ldr q21, [x17, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "ldr q20, [x17, #0x10]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x17, #0x20]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
"cbnz x14, 127b\n"
"128:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1938,33 +1938,33 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v20.8h\n"
+ "fmin v9.8h, v9.8h, v20.8h\n"
+ "fmin v10.8h, v10.8h, v20.8h\n"
+ "fmin v11.8h, v11.8h, v20.8h\n"
+ "fmin v12.8h, v12.8h, v20.8h\n"
+ "fmin v13.8h, v13.8h, v20.8h\n"
+ "fmin v14.8h, v14.8h, v20.8h\n"
+ "fmin v15.8h, v15.8h, v20.8h\n"
+ "fmin v16.8h, v16.8h, v20.8h\n"
+ "fmin v17.8h, v17.8h, v20.8h\n"
+ "fmin v18.8h, v18.8h, v20.8h\n"
+ "fmin v19.8h, v19.8h, v20.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
"129:" // Height 3: No activation
"cmp x8, #0x20\n"
"bge 146f\n"
@@ -2424,484 +2424,484 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
"cbnz x15, 172f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
- "add x9, x9, x20, LSL #1\n"
- "add x27, x27, x20, LSL #1\n"
- "add x25, x25, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
"b 172f\n"
"171:" // Height 4: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #1\n"
- "add x27, x9, x20, LSL #1\n"
- "add x25, x27, x20, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
"172:" // Height 4: input setup done
"cmp x14, #0x8\n"
"blt 175f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 174f\n"
"173:" // Height 4: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d25, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "add x27, x27, #0x10\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "add x25, x25, #0x10\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr x26, [x27, #0x8]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr x24, [x25, #0x8]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr d25, [x17, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr d25, [x17, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr d25, [x17, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
"sub x14, x14, #0x8\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr d25, [x17, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
"cmp x14, #0x10\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x108]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x128]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x148]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x168]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x188]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x1a8]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1c8]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1e8]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "mov v7.d[1], x11\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr d25, [x17, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr d25, [x17, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr d25, [x17, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr d24, [x17, #0x110]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr d25, [x17, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr d24, [x17, #0x130]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr d25, [x17, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr d24, [x17, #0x150]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr d25, [x17, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr d24, [x17, #0x170]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr d25, [x17, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr d24, [x17, #0x190]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr d25, [x17, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr d24, [x17, #0x1b0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr d25, [x17, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr d24, [x17, #0x1d0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr d25, [x17, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr d24, [x17, #0x1f0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x18]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x9, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
+ "ldr d3, [x10, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
- "mov v7.d[1], x11\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 173b\n"
"174:" // Height 4: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q25, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x17, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x17, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x17, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x17, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x17, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x17, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x17, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x17, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x17, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x17, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x17, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x17, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x17, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x17, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x17, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x17, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x17, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x17, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x17, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x17, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x17, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x17, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x17, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x17, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x17, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x17, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x17, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x17, #0x1f0]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"175:" // Height 4: Multiply loop: Main loop skip
"cbz x14, 177f\n"
"176:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x13], #0x2\n"
+ "ldr h3, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h2, [x12], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr q25, [x17, #0x0]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "ldr q24, [x17, #0x10]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x17, #0x20]\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
"cbnz x14, 176b\n"
"177:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2918,41 +2918,41 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v24.8h\n"
+ "fmin v9.8h, v9.8h, v24.8h\n"
+ "fmin v10.8h, v10.8h, v24.8h\n"
+ "fmin v11.8h, v11.8h, v24.8h\n"
+ "fmin v12.8h, v12.8h, v24.8h\n"
+ "fmin v13.8h, v13.8h, v24.8h\n"
+ "fmin v14.8h, v14.8h, v24.8h\n"
+ "fmin v15.8h, v15.8h, v24.8h\n"
+ "fmin v16.8h, v16.8h, v24.8h\n"
+ "fmin v17.8h, v17.8h, v24.8h\n"
+ "fmin v18.8h, v18.8h, v24.8h\n"
+ "fmin v19.8h, v19.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v24.8h\n"
+ "fmin v21.8h, v21.8h, v24.8h\n"
+ "fmin v22.8h, v22.8h, v24.8h\n"
+ "fmin v23.8h, v23.8h, v24.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
"178:" // Height 4: No activation
"cmp x8, #0x20\n"
"bge 195f\n"
@@ -3493,564 +3493,564 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
"cbnz x15, 221f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
"add x9, x9, x20, LSL #1\n"
- "add x27, x27, x20, LSL #1\n"
- "add x25, x25, x20, LSL #1\n"
- "add x23, x23, x20, LSL #1\n"
"b 221f\n"
"220:" // Height 5: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #1\n"
- "add x27, x9, x20, LSL #1\n"
- "add x25, x27, x20, LSL #1\n"
- "add x23, x25, x20, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
+ "add x9, x10, x21, LSL #1\n"
"221:" // Height 5: input setup done
"cmp x14, #0x8\n"
"blt 224f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 223f\n"
"222:" // Height 5: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d29, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "add x23, x23, #0x10\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr x26, [x27, #0x8]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr x24, [x25, #0x8]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr x22, [x23, #0x8]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr d29, [x17, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr d29, [x17, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
"sub x14, x14, #0x8\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
"cmp x14, #0x10\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr d29, [x17, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x108]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x128]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x148]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x168]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x188]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x1a8]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1c8]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1e8]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "mov v7.d[1], x11\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr d29, [x17, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr d29, [x17, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr d29, [x17, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr d29, [x17, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr d28, [x17, #0x110]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr d29, [x17, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr d28, [x17, #0x130]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr d29, [x17, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr d28, [x17, #0x150]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr d29, [x17, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr d28, [x17, #0x170]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr d29, [x17, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr d28, [x17, #0x190]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr d29, [x17, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr d28, [x17, #0x1b0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr d29, [x17, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr d28, [x17, #0x1d0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr d29, [x17, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr d28, [x17, #0x1f0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x18]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x9, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
- "ldr d4, [x23, #0x0]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
+ "ldr d4, [x9, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"bge 222b\n"
"223:" // Height 5: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q29, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x17, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x17, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x17, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x17, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x17, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x17, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x17, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x17, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x17, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x17, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x17, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x17, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x17, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x17, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x17, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x17, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x17, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x17, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x17, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x17, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x17, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x17, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x17, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x17, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x17, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x17, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x17, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x17, #0x1f0]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"224:" // Height 5: Multiply loop: Main loop skip
"cbz x14, 226f\n"
"225:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x13], #0x2\n"
+ "ldr h4, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h3, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h1, [x10], #0x2\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr q29, [x17, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "ldr q28, [x17, #0x10]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x17, #0x20]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
"cbnz x14, 225b\n"
"226:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -4069,49 +4069,49 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmin v24.8h, v24.8h, v0.8h\n"
- "fmin v25.8h, v25.8h, v0.8h\n"
- "fmin v26.8h, v26.8h, v0.8h\n"
- "fmin v27.8h, v27.8h, v0.8h\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v28.8h\n"
+ "fmin v9.8h, v9.8h, v28.8h\n"
+ "fmin v10.8h, v10.8h, v28.8h\n"
+ "fmin v11.8h, v11.8h, v28.8h\n"
+ "fmin v12.8h, v12.8h, v28.8h\n"
+ "fmin v13.8h, v13.8h, v28.8h\n"
+ "fmin v14.8h, v14.8h, v28.8h\n"
+ "fmin v15.8h, v15.8h, v28.8h\n"
+ "fmin v16.8h, v16.8h, v28.8h\n"
+ "fmin v17.8h, v17.8h, v28.8h\n"
+ "fmin v18.8h, v18.8h, v28.8h\n"
+ "fmin v19.8h, v19.8h, v28.8h\n"
+ "fmin v20.8h, v20.8h, v28.8h\n"
+ "fmin v21.8h, v21.8h, v28.8h\n"
+ "fmin v22.8h, v22.8h, v28.8h\n"
+ "fmin v23.8h, v23.8h, v28.8h\n"
+ "fmin v24.8h, v24.8h, v28.8h\n"
+ "fmin v25.8h, v25.8h, v28.8h\n"
+ "fmin v26.8h, v26.8h, v28.8h\n"
+ "fmin v27.8h, v27.8h, v28.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v0.8h\n"
- "fmax v25.8h, v25.8h, v0.8h\n"
- "fmax v26.8h, v26.8h, v0.8h\n"
- "fmax v27.8h, v27.8h, v0.8h\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
"227:" // Height 5: No activation
"cmp x8, #0x20\n"
"bge 244f\n"
@@ -4736,98 +4736,98 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
"cbnz x15, 270f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
"add x9, x9, x20, LSL #1\n"
- "add x27, x27, x20, LSL #1\n"
- "add x25, x25, x20, LSL #1\n"
- "add x23, x23, x20, LSL #1\n"
- "add x21, x21, x20, LSL #1\n"
+ "add x28, x28, x20, LSL #1\n"
"b 270f\n"
"269:" // Height 6: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #1\n"
- "add x27, x9, x20, LSL #1\n"
- "add x25, x27, x20, LSL #1\n"
- "add x23, x25, x20, LSL #1\n"
- "add x21, x23, x20, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
+ "add x9, x10, x21, LSL #1\n"
+ "add x28, x9, x21, LSL #1\n"
"270:" // Height 6: input setup done
"cmp x14, #0x8\n"
"blt 273f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x10\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 272f\n"
"271:" // Height 6: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
"ldr d6, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr x10, [x13, #0x8]\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x12, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr x20, [x21, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"sub x14, x14, #0x8\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
@@ -4837,240 +4837,240 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v28.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0x88]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
"fmla v30.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr x12, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
"fmla v31.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
"fmla v28.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xc8]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
"fmla v29.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
"fmla v30.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr x12, [x17, #0xe8]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
"fmla v31.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
"fmla v28.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x108]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
"fmla v29.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
+ "ldr x20, [x17, #0x118]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
"fmla v30.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr x12, [x17, #0x128]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
"fmla v31.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0x110]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
"fmla v28.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x148]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
"fmla v29.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x130]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
+ "ldr x20, [x17, #0x158]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
"fmla v30.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr x12, [x17, #0x168]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
"fmla v31.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x150]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
"fmla v28.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x188]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
"fmla v29.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x170]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
+ "ldr x20, [x17, #0x198]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
"fmla v30.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr x12, [x17, #0x1a8]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
"fmla v31.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x190]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
"fmla v28.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1c8]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
"fmla v29.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1b0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
+ "ldr x20, [x17, #0x1d8]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
"fmla v30.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr x12, [x17, #0x1e8]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
"fmla v31.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1d0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
"fmla v28.8h, v6.8h, v5.h[7]\n"
"ldr d6, [x17, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
"fmla v29.8h, v7.8h, v5.h[7]\n"
"ldr d7, [x17, #0x1f0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "ldr x12, [x17, #0x8]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
"fmla v26.8h, v6.8h, v4.h[7]\n"
@@ -5079,56 +5079,56 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v11.8h, v7.8h, v0.h[7]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x11, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d3, [x10, #0x0]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
- "ldr d4, [x23, #0x0]\n"
+ "ldr d4, [x9, #0x0]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
- "ldr d5, [x21, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
"mov v3.d[1], x24\n"
- "mov v4.d[1], x22\n"
- "mov v5.d[1], x20\n"
- "mov v7.d[1], x11\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 271b\n"
"272:" // Height 6: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
"ldr q6, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr q7, [x17, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
@@ -5338,42 +5338,42 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"273:" // Height 6: Multiply loop: Main loop skip
"cbz x14, 275f\n"
"274:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x13], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
"sub x14, x14, #0x1\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x21], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h6, [x12], #0x2\n"
+ "ldr h5, [x11], #0x2\n"
+ "ldr h4, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h2, [x28], #0x2\n"
+ "ldr q1, [x17, #0x0]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x17, #0x30]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v30.8h, v6.8h, v5.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
"cbnz x14, 274b\n"
"275:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -5743,7 +5743,6 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 335308751f..8e5f600c83 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -244,11 +244,11 @@ void a64_hybrid_fp16_mla_6x32 (
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -265,69 +265,69 @@ void a64_hybrid_fp16_mla_6x32 (
"blt 27f\n"
"26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x10\n"
"add x10, x10, #0x200\n"
@@ -337,84 +337,84 @@ void a64_hybrid_fp16_mla_6x32 (
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x8\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x200\n"
"28:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
"ldr h0, [x26], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
"cbnz x27, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
@@ -425,17 +425,17 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
"31:" // Height 1: No activation
"cmp x11, #0x20\n"
"bge 48f\n"
@@ -733,12 +733,12 @@ void a64_hybrid_fp16_mla_6x32 (
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 74f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -746,7 +746,7 @@ void a64_hybrid_fp16_mla_6x32 (
"b 74f\n"
"73:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"74:" // Height 2: input setup done
"cmp x27, #0x8\n"
"blt 77f\n"
@@ -759,230 +759,230 @@ void a64_hybrid_fp16_mla_6x32 (
"75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"sub x27, x27, #0x8\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"add x25, x25, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"cmp x27, #0x10\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 75b\n"
"76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"add x26, x26, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"sub x27, x27, #0x8\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"77:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 79f\n"
"78:" // Height 2: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
"cbnz x27, 78b\n"
"79:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -995,25 +995,25 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmin v12.8h, v12.8h, v17.8h\n"
+ "fmin v13.8h, v13.8h, v17.8h\n"
+ "fmin v14.8h, v14.8h, v17.8h\n"
+ "fmin v15.8h, v15.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
"80:" // Height 2: No activation
"cmp x11, #0x20\n"
"bge 97f\n"
@@ -1392,13 +1392,13 @@ void a64_hybrid_fp16_mla_6x32 (
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 123f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1407,8 +1407,8 @@ void a64_hybrid_fp16_mla_6x32 (
"b 123f\n"
"122:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"123:" // Height 3: input setup done
"cmp x27, #0x8\n"
"blt 126f\n"
@@ -1425,139 +1425,139 @@ void a64_hybrid_fp16_mla_6x32 (
"sub x27, x27, #0x8\n"
"add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x25, x25, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
"cmp x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 124b\n"
@@ -1567,159 +1567,159 @@ void a64_hybrid_fp16_mla_6x32 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x24, x24, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x8\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"126:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 128f\n"
"127:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
"ldr h1, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
"cbnz x27, 127b\n"
"128:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1734,33 +1734,33 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v21.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v21.8h\n"
+ "fmin v9.8h, v9.8h, v21.8h\n"
+ "fmin v10.8h, v10.8h, v21.8h\n"
+ "fmin v11.8h, v11.8h, v21.8h\n"
+ "fmin v12.8h, v12.8h, v21.8h\n"
+ "fmin v13.8h, v13.8h, v21.8h\n"
+ "fmin v14.8h, v14.8h, v21.8h\n"
+ "fmin v15.8h, v15.8h, v21.8h\n"
+ "fmin v16.8h, v16.8h, v21.8h\n"
+ "fmin v17.8h, v17.8h, v21.8h\n"
+ "fmin v18.8h, v18.8h, v21.8h\n"
+ "fmin v19.8h, v19.8h, v21.8h\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
"129:" // Height 3: No activation
"cmp x11, #0x20\n"
"bge 146f\n"
@@ -2220,14 +2220,14 @@ void a64_hybrid_fp16_mla_6x32 (
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 172f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2237,9 +2237,9 @@ void a64_hybrid_fp16_mla_6x32 (
"b 172f\n"
"171:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"172:" // Height 4: input setup done
"cmp x27, #0x8\n"
"blt 175f\n"
@@ -2258,7 +2258,7 @@ void a64_hybrid_fp16_mla_6x32 (
"add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x25, x25, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2266,165 +2266,165 @@ void a64_hybrid_fp16_mla_6x32 (
"add x23, x23, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"cmp x27, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 173b\n"
@@ -2435,7 +2435,7 @@ void a64_hybrid_fp16_mla_6x32 (
"add x25, x25, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x24, x24, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2443,189 +2443,189 @@ void a64_hybrid_fp16_mla_6x32 (
"sub x27, x27, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"175:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 177f\n"
"176:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
"cbnz x27, 176b\n"
"177:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2642,41 +2642,41 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmin v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v1.8h\n"
- "fmin v22.8h, v22.8h, v1.8h\n"
- "fmin v23.8h, v23.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v25.8h\n"
+ "fmin v9.8h, v9.8h, v25.8h\n"
+ "fmin v10.8h, v10.8h, v25.8h\n"
+ "fmin v11.8h, v11.8h, v25.8h\n"
+ "fmin v12.8h, v12.8h, v25.8h\n"
+ "fmin v13.8h, v13.8h, v25.8h\n"
+ "fmin v14.8h, v14.8h, v25.8h\n"
+ "fmin v15.8h, v15.8h, v25.8h\n"
+ "fmin v16.8h, v16.8h, v25.8h\n"
+ "fmin v17.8h, v17.8h, v25.8h\n"
+ "fmin v18.8h, v18.8h, v25.8h\n"
+ "fmin v19.8h, v19.8h, v25.8h\n"
+ "fmin v20.8h, v20.8h, v25.8h\n"
+ "fmin v21.8h, v21.8h, v25.8h\n"
+ "fmin v22.8h, v22.8h, v25.8h\n"
+ "fmin v23.8h, v23.8h, v25.8h\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
"178:" // Height 4: No activation
"cmp x11, #0x20\n"
"bge 195f\n"
@@ -3217,15 +3217,15 @@ void a64_hybrid_fp16_mla_6x32 (
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 221f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -3236,10 +3236,10 @@ void a64_hybrid_fp16_mla_6x32 (
"b 221f\n"
"220:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"221:" // Height 5: input setup done
"cmp x27, #0x8\n"
"blt 224f\n"
@@ -3262,7 +3262,7 @@ void a64_hybrid_fp16_mla_6x32 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x23, x23, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3271,196 +3271,196 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp x27, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
"ldr q3, [x23, #0x0]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 222b\n"
@@ -3474,7 +3474,7 @@ void a64_hybrid_fp16_mla_6x32 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"add x22, x22, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3483,224 +3483,224 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pldl1keep, [x26, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"224:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 226f\n"
"225:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h4, [x26], #0x2\n"
+ "ldr h3, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
"ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h0, [x22], #0x2\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
"cbnz x27, 225b\n"
"226:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3719,49 +3719,49 @@ void a64_hybrid_fp16_mla_6x32 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v29.8h }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.8h }, [x20]\n"
- "fmin v8.8h, v8.8h, v1.8h\n"
- "fmin v9.8h, v9.8h, v1.8h\n"
- "fmin v10.8h, v10.8h, v1.8h\n"
- "fmin v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v1.8h\n"
- "fmin v13.8h, v13.8h, v1.8h\n"
- "fmin v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v1.8h\n"
- "fmin v16.8h, v16.8h, v1.8h\n"
- "fmin v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v1.8h\n"
- "fmin v19.8h, v19.8h, v1.8h\n"
- "fmin v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v1.8h\n"
- "fmin v22.8h, v22.8h, v1.8h\n"
- "fmin v23.8h, v23.8h, v1.8h\n"
- "fmin v24.8h, v24.8h, v1.8h\n"
- "fmin v25.8h, v25.8h, v1.8h\n"
- "fmin v26.8h, v26.8h, v1.8h\n"
- "fmin v27.8h, v27.8h, v1.8h\n"
- "fmax v8.8h, v8.8h, v0.8h\n"
- "fmax v9.8h, v9.8h, v0.8h\n"
- "fmax v10.8h, v10.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v0.8h\n"
- "fmax v13.8h, v13.8h, v0.8h\n"
- "fmax v14.8h, v14.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v0.8h\n"
- "fmax v16.8h, v16.8h, v0.8h\n"
- "fmax v17.8h, v17.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v0.8h\n"
- "fmax v19.8h, v19.8h, v0.8h\n"
- "fmax v20.8h, v20.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v0.8h\n"
- "fmax v22.8h, v22.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v0.8h\n"
- "fmax v25.8h, v25.8h, v0.8h\n"
- "fmax v26.8h, v26.8h, v0.8h\n"
- "fmax v27.8h, v27.8h, v0.8h\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v29.8h\n"
+ "fmin v9.8h, v9.8h, v29.8h\n"
+ "fmin v10.8h, v10.8h, v29.8h\n"
+ "fmin v11.8h, v11.8h, v29.8h\n"
+ "fmin v12.8h, v12.8h, v29.8h\n"
+ "fmin v13.8h, v13.8h, v29.8h\n"
+ "fmin v14.8h, v14.8h, v29.8h\n"
+ "fmin v15.8h, v15.8h, v29.8h\n"
+ "fmin v16.8h, v16.8h, v29.8h\n"
+ "fmin v17.8h, v17.8h, v29.8h\n"
+ "fmin v18.8h, v18.8h, v29.8h\n"
+ "fmin v19.8h, v19.8h, v29.8h\n"
+ "fmin v20.8h, v20.8h, v29.8h\n"
+ "fmin v21.8h, v21.8h, v29.8h\n"
+ "fmin v22.8h, v22.8h, v29.8h\n"
+ "fmin v23.8h, v23.8h, v29.8h\n"
+ "fmin v24.8h, v24.8h, v29.8h\n"
+ "fmin v25.8h, v25.8h, v29.8h\n"
+ "fmin v26.8h, v26.8h, v29.8h\n"
+ "fmin v27.8h, v27.8h, v29.8h\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
"227:" // Height 5: No activation
"cmp x11, #0x20\n"
"bge 244f\n"
@@ -4386,16 +4386,16 @@ void a64_hybrid_fp16_mla_6x32 (
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 270f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -4407,11 +4407,11 @@ void a64_hybrid_fp16_mla_6x32 (
"b 270f\n"
"269:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"270:" // Height 6: input setup done
"cmp x27, #0x8\n"
"blt 273f\n"
@@ -4912,42 +4912,42 @@ void a64_hybrid_fp16_mla_6x32 (
"273:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 275f\n"
"274:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x25], #0x2\n"
+ "ldr h7, [x26], #0x2\n"
+ "ldr h6, [x25], #0x2\n"
"sub x27, x27, #0x1\n"
- "ldr h2, [x24], #0x2\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x21], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h5, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v30.8h, v6.8h, v5.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
"cbnz x27, 274b\n"
"275:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -5317,7 +5317,6 @@ void a64_hybrid_fp16_mla_6x32 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
index e155bfb111..171929e65e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -113,5 +113,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
index 700d803f82..9ceda8fd0c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_4x24_a55 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 124f\n"
@@ -223,11 +222,11 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"19:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x11, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
"cbnz x13, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x11, x11, x20, LSL #2\n"
@@ -246,176 +245,176 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"blt 23f\n"
"22:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr d4, [x15, #0x40]\n"
- "ldr x10, [x15, #0x48]\n"
+ "ldr d19, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr d5, [x15, #0x50]\n"
+ "ldr d18, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr d6, [x15, #0x60]\n"
+ "ldr d17, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v4.d[1], x10\n"
- "ldr x9, [x15, #0x58]\n"
- "mov v5.d[1], x9\n"
- "ldr x28, [x15, #0x68]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0x78]\n"
- "mov v7.d[1], x27\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr d4, [x15, #0x80]\n"
- "ldr x10, [x15, #0x88]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr d5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v4.d[1], x10\n"
- "ldr x9, [x15, #0x98]\n"
- "mov v5.d[1], x9\n"
- "ldr x28, [x15, #0xa8]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0xb8]\n"
- "mov v7.d[1], x27\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "ldr d4, [x15, #0xc0]\n"
- "ldr x10, [x15, #0xc8]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr d5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v4.d[1], x10\n"
- "ldr x9, [x15, #0xd8]\n"
- "mov v5.d[1], x9\n"
- "ldr x28, [x15, #0xe8]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0xf8]\n"
- "mov v7.d[1], x27\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "ldr d4, [x15, #0x100]\n"
- "ldr x10, [x15, #0x108]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr d5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr d6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr d7, [x15, #0x130]\n"
- "mov v4.d[1], x10\n"
- "ldr x9, [x15, #0x118]\n"
- "mov v5.d[1], x9\n"
- "ldr x28, [x15, #0x128]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0x138]\n"
- "mov v7.d[1], x27\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "ldr d4, [x15, #0x140]\n"
- "ldr x10, [x15, #0x148]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr d5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr d6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr d7, [x15, #0x170]\n"
- "mov v4.d[1], x10\n"
- "ldr x9, [x15, #0x158]\n"
- "mov v5.d[1], x9\n"
- "ldr x28, [x15, #0x168]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0x178]\n"
- "mov v7.d[1], x27\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x68]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr d19, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr d18, [x15, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr d19, [x15, #0xc0]\n"
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr d18, [x15, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr d19, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr d18, [x15, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x15, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x15, #0x130]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x118]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x128]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x138]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr d19, [x15, #0x140]\n"
+ "ldr x20, [x15, #0x148]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr d18, [x15, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr d17, [x15, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr d16, [x15, #0x170]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x158]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x168]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x178]\n"
+ "mov v16.d[1], x20\n"
"add x11, x11, #0x10\n"
"add x15, x15, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
"ldr d4, [x15, #0x0]\n"
- "ldr x10, [x15, #0x8]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr x20, [x15, #0x8]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
"ldr d5, [x15, #0x10]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
"ldr d6, [x15, #0x20]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
"sub x12, x12, #0x4\n"
"ldr d7, [x15, #0x30]\n"
"cmp x12, #0x8\n"
- "ldr x9, [x15, #0x18]\n"
- "mov v4.d[1], x10\n"
- "ldr x28, [x15, #0x28]\n"
- "mov v5.d[1], x9\n"
- "ldr x26, [x11, #0x8]\n"
- "mov v6.d[1], x28\n"
- "ldr x27, [x15, #0x38]\n"
- "mov v0.d[1], x26\n"
- "mov v7.d[1], x27\n"
+ "ldr x21, [x15, #0x18]\n"
+ "mov v4.d[1], x20\n"
+ "ldr x20, [x15, #0x28]\n"
+ "mov v5.d[1], x21\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x11, #0x80]\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
+ "ldr q19, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
+ "ldr q18, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q17, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x15, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "ldr q4, [x15, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr q5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x15, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x15, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x15, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q16, [x15, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x15, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x15, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x15, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x15, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x15, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x15, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x15, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x15, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x15, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x15, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x15, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x15, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x15, #0x170]\n"
"add x11, x11, #0x10\n"
"sub x12, x12, #0x4\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
"add x15, x15, #0x180\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
"24:" // Height 1: Multiply loop: Main loop skip
"cbz x12, 26f\n"
"25:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
+ "ldr s17, [x11], #0x4\n"
"sub x12, x12, #0x1\n"
- "ldr q4, [x15, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x10]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x40]\n"
+ "fmla v12.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ "fmla v13.4s, v16.4s, v17.s[0]\n"
"add x15, x15, #0x60\n"
"cbnz x12, 25b\n"
"26:" // Height 1: Multiply loop: No odd multiplies
@@ -426,21 +425,21 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"prfm pstl1keep, [x14, #0x0]\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
"27:" // Height 1: No activation
"cmp x16, #0x18\n"
"bge 40f\n"
@@ -701,26 +700,26 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"60:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x11, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
"cbnz x13, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x11, x11, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
"b 62f\n"
"61:" // Height 2: setup direct input
"mov x11, %x[input_ptr]\n"
- "add x25, x11, x20, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
"62:" // Height 2: input setup done
"cmp x12, #0x4\n"
"blt 65f\n"
"ldr q0, [x11, #0x0]\n"
"cmp x12, #0x8\n"
- "ldr q1, [x25, #0x0]\n"
+ "ldr q1, [x10, #0x0]\n"
"ldr q4, [x15, #0x0]\n"
"ldr q5, [x15, #0x10]\n"
"ldr q6, [x15, #0x20]\n"
@@ -728,239 +727,239 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"blt 64f\n"
"63:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr x10, [x15, #0x48]\n"
+ "ldr x23, [x15, #0x48]\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr d4, [x15, #0x40]\n"
+ "ldr d23, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr x9, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x58]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr d5, [x15, #0x50]\n"
+ "ldr d22, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x28, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x68]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr d6, [x15, #0x60]\n"
+ "ldr d21, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x27, [x15, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v4.d[1], x10\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "mov v5.d[1], x9\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr d4, [x15, #0x80]\n"
- "mov v6.d[1], x28\n"
- "mov v7.d[1], x27\n"
- "ldr x10, [x15, #0x88]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr x9, [x15, #0x98]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr d5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x28, [x15, #0xa8]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr x27, [x15, #0xb8]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v4.d[1], x10\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "mov v5.d[1], x9\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr d4, [x15, #0xc0]\n"
- "mov v6.d[1], x28\n"
- "mov v7.d[1], x27\n"
- "ldr x10, [x15, #0xc8]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr x9, [x15, #0xd8]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr d5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr x28, [x15, #0xe8]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr x27, [x15, #0xf8]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v4.d[1], x10\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "mov v5.d[1], x9\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr d4, [x15, #0x100]\n"
- "mov v6.d[1], x28\n"
- "mov v7.d[1], x27\n"
- "ldr x10, [x15, #0x108]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr x9, [x15, #0x118]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr d5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr x28, [x15, #0x128]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "ldr d6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr x27, [x15, #0x138]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr d7, [x15, #0x130]\n"
- "mov v4.d[1], x10\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "mov v5.d[1], x9\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr d4, [x15, #0x140]\n"
- "mov v6.d[1], x28\n"
- "mov v7.d[1], x27\n"
- "ldr x10, [x15, #0x148]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr x9, [x15, #0x158]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "ldr d5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr x28, [x15, #0x168]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr d6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr x27, [x15, #0x178]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d7, [x15, #0x170]\n"
- "mov v4.d[1], x10\n"
+ "ldr d20, [x15, #0x70]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr d23, [x15, #0x80]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x88]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr d22, [x15, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr d21, [x15, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr d20, [x15, #0xb0]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr d23, [x15, #0xc0]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0xc8]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr d22, [x15, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr d21, [x15, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr d20, [x15, #0xf0]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr d23, [x15, #0x100]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x108]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr d22, [x15, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x128]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr d21, [x15, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "ldr x20, [x15, #0x138]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr d20, [x15, #0x130]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr d23, [x15, #0x140]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x148]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr d22, [x15, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "ldr x21, [x15, #0x168]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr d21, [x15, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "ldr x20, [x15, #0x178]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr d20, [x15, #0x170]\n"
+ "mov v23.d[1], x23\n"
"add x11, x11, #0x10\n"
- "mov v5.d[1], x9\n"
- "add x25, x25, #0x10\n"
- "mov v6.d[1], x28\n"
+ "mov v22.d[1], x22\n"
+ "add x10, x10, #0x10\n"
+ "mov v21.d[1], x21\n"
"add x15, x15, #0x180\n"
- "mov v7.d[1], x27\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
"ldr d4, [x15, #0x0]\n"
- "ldr x10, [x15, #0x8]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
"ldr d5, [x15, #0x10]\n"
- "ldr x9, [x15, #0x18]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "ldr x20, [x15, #0x18]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
"ldr d6, [x15, #0x20]\n"
- "ldr x28, [x15, #0x28]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr x23, [x15, #0x28]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x25, #0x0]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
+ "ldr d1, [x10, #0x0]\n"
"sub x12, x12, #0x4\n"
"ldr d7, [x15, #0x30]\n"
"cmp x12, #0x8\n"
- "ldr x26, [x11, #0x8]\n"
- "mov v4.d[1], x10\n"
- "ldr x24, [x25, #0x8]\n"
- "mov v5.d[1], x9\n"
- "ldr x27, [x15, #0x38]\n"
- "mov v6.d[1], x28\n"
+ "ldr x22, [x11, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "ldr x21, [x10, #0x8]\n"
+ "mov v5.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v6.d[1], x23\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "mov v0.d[1], x26\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "mov v1.d[1], x24\n"
- "mov v7.d[1], x27\n"
+ "mov v0.d[1], x22\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v1.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"bge 63b\n"
"64:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
+ "ldr q23, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
+ "ldr q22, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"sub x12, x12, #0x4\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q21, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x15, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x15, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr q4, [x15, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr q5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x15, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x15, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x15, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x15, #0x170]\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr q20, [x15, #0x70]\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x15, #0x80]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x15, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x15, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x15, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x15, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x15, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x15, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x15, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x15, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x15, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x15, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x15, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x15, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x15, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x15, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x15, #0x170]\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
"add x15, x15, #0x180\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
"65:" // Height 2: Multiply loop: Main loop skip
"cbz x12, 67f\n"
"66:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
+ "ldr s25, [x11], #0x4\n"
"sub x12, x12, #0x1\n"
- "ldr s1, [x25], #0x4\n"
- "ldr q4, [x15, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x10]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr q21, [x15, #0x0]\n"
+ "fmla v8.4s, v21.4s, v25.s[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ "fmla v14.4s, v21.4s, v24.s[0]\n"
+ "ldr q23, [x15, #0x20]\n"
+ "fmla v9.4s, v20.4s, v25.s[0]\n"
+ "ldr q22, [x15, #0x30]\n"
+ "fmla v15.4s, v20.4s, v24.s[0]\n"
+ "ldr q21, [x15, #0x40]\n"
+ "fmla v10.4s, v23.4s, v25.s[0]\n"
+ "ldr q20, [x15, #0x50]\n"
+ "fmla v16.4s, v23.4s, v24.s[0]\n"
+ "fmla v11.4s, v22.4s, v25.s[0]\n"
"add x15, x15, #0x60\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v17.4s, v22.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v25.s[0]\n"
+ "fmla v18.4s, v21.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v25.s[0]\n"
+ "fmla v19.4s, v20.4s, v24.s[0]\n"
"cbnz x12, 66b\n"
"67:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -973,33 +972,33 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 68f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"68:" // Height 2: No activation
"cmp x16, #0x18\n"
"bge 81f\n"
@@ -1339,30 +1338,30 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"101:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 102f\n"
- "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x11, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x23, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
+ "ldr x9, [x20, #0x10]\n"
"cbnz x13, 103f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x11, x11, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
- "add x23, x23, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
"b 103f\n"
"102:" // Height 3: setup direct input
"mov x11, %x[input_ptr]\n"
- "add x25, x11, x20, LSL #2\n"
- "add x23, x25, x20, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
"103:" // Height 3: input setup done
"cmp x12, #0x4\n"
"blt 106f\n"
"ldr q0, [x11, #0x0]\n"
"cmp x12, #0x8\n"
- "ldr q1, [x25, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q2, [x9, #0x0]\n"
"ldr q4, [x15, #0x0]\n"
"ldr q5, [x15, #0x10]\n"
"ldr q6, [x15, #0x20]\n"
@@ -1370,301 +1369,301 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"blt 105f\n"
"104:" // Height 3: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr x10, [x15, #0x48]\n"
+ "ldr x23, [x15, #0x48]\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr x9, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x58]\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr d4, [x15, #0x40]\n"
+ "ldr d29, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr x28, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x68]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr x27, [x15, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr d5, [x15, #0x50]\n"
+ "ldr d28, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "mov v4.d[1], x10\n"
+ "mov v29.d[1], x23\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "mov v5.d[1], x9\n"
+ "mov v28.d[1], x22\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "ldr d6, [x15, #0x60]\n"
+ "ldr d27, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x28\n"
+ "mov v27.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr x10, [x15, #0x88]\n"
+ "ldr x23, [x15, #0x88]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x27\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr x9, [x15, #0x98]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "ldr d4, [x15, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr x28, [x15, #0xa8]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr x27, [x15, #0xb8]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "ldr d5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "mov v4.d[1], x10\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "mov v5.d[1], x9\n"
- "fmla v20.4s, v6.4s, v2.s[1]\n"
- "ldr d6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x28\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x10, [x15, #0xc8]\n"
- "fmla v21.4s, v7.4s, v2.s[1]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x27\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr x9, [x15, #0xd8]\n"
- "fmla v22.4s, v4.4s, v2.s[1]\n"
- "ldr d4, [x15, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr x28, [x15, #0xe8]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr x27, [x15, #0xf8]\n"
- "fmla v23.4s, v5.4s, v2.s[1]\n"
- "ldr d5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "mov v4.d[1], x10\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "mov v5.d[1], x9\n"
- "fmla v24.4s, v6.4s, v2.s[1]\n"
- "ldr d6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x28\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr x10, [x15, #0x108]\n"
- "fmla v25.4s, v7.4s, v2.s[1]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x27\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr x9, [x15, #0x118]\n"
- "fmla v20.4s, v4.4s, v2.s[2]\n"
- "ldr d4, [x15, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr x28, [x15, #0x128]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr x27, [x15, #0x138]\n"
- "fmla v21.4s, v5.4s, v2.s[2]\n"
- "ldr d5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "mov v4.d[1], x10\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "mov v5.d[1], x9\n"
- "fmla v22.4s, v6.4s, v2.s[2]\n"
- "ldr d6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x28\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr x10, [x15, #0x148]\n"
- "fmla v23.4s, v7.4s, v2.s[2]\n"
- "ldr d7, [x15, #0x130]\n"
- "mov v7.d[1], x27\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr x9, [x15, #0x158]\n"
- "fmla v24.4s, v4.4s, v2.s[2]\n"
- "ldr d4, [x15, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr x28, [x15, #0x168]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "ldr x27, [x15, #0x178]\n"
- "fmla v25.4s, v5.4s, v2.s[2]\n"
- "ldr d5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "mov v4.d[1], x10\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "mov v5.d[1], x9\n"
- "fmla v20.4s, v6.4s, v2.s[3]\n"
- "ldr d6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x28\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d26, [x15, #0x70]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr d29, [x15, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr d28, [x15, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr d27, [x15, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "ldr x23, [x15, #0xc8]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr d26, [x15, #0xb0]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr d29, [x15, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr d28, [x15, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr d27, [x15, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "ldr x23, [x15, #0x108]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr d26, [x15, #0xf0]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr d29, [x15, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x128]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x138]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr d28, [x15, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr d27, [x15, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "ldr x23, [x15, #0x148]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr d26, [x15, #0x130]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr d29, [x15, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x168]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x178]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr d28, [x15, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr d27, [x15, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
"add x11, x11, #0x10\n"
- "fmla v21.4s, v7.4s, v2.s[3]\n"
- "ldr d7, [x15, #0x170]\n"
- "mov v7.d[1], x27\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr d26, [x15, #0x170]\n"
+ "mov v26.d[1], x20\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
"add x15, x15, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "ldr x10, [x15, #0x8]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "ldr x9, [x15, #0x18]\n"
- "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "ldr x25, [x15, #0x18]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
"ldr d4, [x15, #0x0]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "ldr x28, [x15, #0x28]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "ldr x26, [x11, #0x8]\n"
- "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
"ldr d5, [x15, #0x10]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "ldr x24, [x25, #0x8]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "ldr x22, [x23, #0x8]\n"
- "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "ldr x21, [x9, #0x8]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
"ldr d6, [x15, #0x20]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x25, #0x0]\n"
- "fmla v25.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x23, #0x0]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "ldr d1, [x10, #0x0]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
+ "ldr d2, [x9, #0x0]\n"
"ldr d7, [x15, #0x30]\n"
"sub x12, x12, #0x4\n"
- "ldr x27, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"cmp x12, #0x8\n"
"prfm pldl1keep, [x11, #0x80]\n"
- "mov v4.d[1], x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "mov v5.d[1], x9\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "mov v6.d[1], x28\n"
- "mov v0.d[1], x26\n"
- "mov v1.d[1], x24\n"
- "mov v2.d[1], x22\n"
- "mov v7.d[1], x27\n"
+ "mov v4.d[1], x26\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v5.d[1], x25\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v6.d[1], x24\n"
+ "mov v0.d[1], x23\n"
+ "mov v1.d[1], x22\n"
+ "mov v2.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"bge 104b\n"
"105:" // Height 3: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
+ "ldr q29, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
"sub x12, x12, #0x4\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
+ "ldr q28, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"prfm pldl1keep, [x11, #0x80]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q27, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x15, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x15, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v20.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v21.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[1]\n"
- "ldr q4, [x15, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "fmla v23.4s, v5.4s, v2.s[1]\n"
- "ldr q5, [x15, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "fmla v24.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "fmla v25.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "fmla v20.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x15, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "fmla v21.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x15, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "fmla v22.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x15, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "fmla v23.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x15, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "fmla v24.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x15, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "fmla v25.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x15, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v20.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x15, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v21.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x15, #0x170]\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr q26, [x15, #0x70]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x15, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x15, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x15, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x15, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x15, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x15, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x15, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x15, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x15, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x15, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x15, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x15, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x15, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x15, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x15, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x15, #0x170]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
"add x15, x15, #0x180\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "fmla v22.4s, v4.4s, v2.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "fmla v23.4s, v5.4s, v2.s[3]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "fmla v24.4s, v6.4s, v2.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
- "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
"106:" // Height 3: Multiply loop: Main loop skip
"cbz x12, 108f\n"
"107:" // Height 3: Multiply loop: Odd block loop
"ldr s0, [x11], #0x4\n"
"sub x12, x12, #0x1\n"
- "ldr s1, [x25], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q4, [x15, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x10]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q27, [x15, #0x0]\n"
+ "fmla v8.4s, v27.4s, v0.s[0]\n"
+ "ldr q26, [x15, #0x10]\n"
+ "fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ "fmla v20.4s, v27.4s, v30.s[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ "fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q27, [x15, #0x40]\n"
+ "fmla v15.4s, v26.4s, v31.s[0]\n"
+ "fmla v21.4s, v26.4s, v30.s[0]\n"
+ "ldr q26, [x15, #0x50]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
"add x15, x15, #0x60\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
- "fmla v22.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
- "fmla v23.4s, v7.4s, v2.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v16.4s, v29.4s, v31.s[0]\n"
+ "fmla v22.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v17.4s, v28.4s, v31.s[0]\n"
+ "fmla v23.4s, v28.4s, v30.s[0]\n"
+ "fmla v12.4s, v27.4s, v0.s[0]\n"
+ "fmla v18.4s, v27.4s, v31.s[0]\n"
+ "fmla v24.4s, v27.4s, v30.s[0]\n"
+ "fmla v13.4s, v26.4s, v0.s[0]\n"
+ "fmla v19.4s, v26.4s, v31.s[0]\n"
+ "fmla v25.4s, v26.4s, v30.s[0]\n"
"cbnz x12, 107b\n"
"108:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1679,45 +1678,45 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 109f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v26.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
"109:" // Height 3: No activation
"cmp x16, #0x18\n"
"bge 122f\n"
@@ -2139,34 +2138,34 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"142:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w12, [x20, x13, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 143f\n"
- "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x11, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x23, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
+ "ldr x9, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
"cbnz x13, 144f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x11, x11, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
- "add x23, x23, x20, LSL #2\n"
- "add x21, x21, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
+ "add x28, x28, x20, LSL #2\n"
"b 144f\n"
"143:" // Height 4: setup direct input
"mov x11, %x[input_ptr]\n"
- "add x25, x11, x20, LSL #2\n"
- "add x23, x25, x20, LSL #2\n"
- "add x21, x23, x20, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
+ "add x28, x9, x21, LSL #2\n"
"144:" // Height 4: input setup done
"cmp x12, #0x4\n"
"blt 147f\n"
"ldr q0, [x11, #0x0]\n"
"cmp x12, #0x8\n"
- "ldr q1, [x25, #0x0]\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
"ldr q4, [x15, #0x0]\n"
"ldr q5, [x15, #0x10]\n"
"ldr q6, [x15, #0x20]\n"
@@ -2174,177 +2173,177 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"blt 146f\n"
"145:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr x10, [x15, #0x48]\n"
+ "ldr x23, [x15, #0x48]\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr x9, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x58]\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr x28, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x68]\n"
"fmla v26.4s, v4.4s, v3.s[0]\n"
"ldr d4, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr x27, [x15, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr x10, [x15, #0x88]\n"
+ "ldr x23, [x15, #0x88]\n"
"fmla v27.4s, v5.4s, v3.s[0]\n"
"ldr d5, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr x9, [x15, #0x98]\n"
+ "ldr x22, [x15, #0x98]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v28.4s, v6.4s, v3.s[0]\n"
"ldr d6, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x28\n"
+ "mov v6.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr x28, [x15, #0xa8]\n"
+ "ldr x21, [x15, #0xa8]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v29.4s, v7.4s, v3.s[0]\n"
"ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x27\n"
+ "mov v7.d[1], x20\n"
"fmla v12.4s, v4.4s, v0.s[0]\n"
"fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr x27, [x15, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v24.4s, v4.4s, v2.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v30.4s, v4.4s, v3.s[0]\n"
"ldr d4, [x15, #0x80]\n"
"fmla v13.4s, v5.4s, v0.s[0]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr x10, [x15, #0xc8]\n"
+ "ldr x23, [x15, #0xc8]\n"
"fmla v25.4s, v5.4s, v2.s[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v31.4s, v5.4s, v3.s[0]\n"
"ldr d5, [x15, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x9, [x15, #0xd8]\n"
+ "ldr x22, [x15, #0xd8]\n"
"fmla v20.4s, v6.4s, v2.s[1]\n"
- "ldr x26, [x11, #0x8]\n"
+ "ldr x27, [x11, #0x8]\n"
"fmla v26.4s, v6.4s, v3.s[1]\n"
"ldr d6, [x15, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x28\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x28, [x15, #0xe8]\n"
+ "ldr x21, [x15, #0xe8]\n"
"fmla v21.4s, v7.4s, v2.s[1]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x26, [x10, #0x8]\n"
"fmla v27.4s, v7.4s, v3.s[1]\n"
"ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x27\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v4.4s, v0.s[1]\n"
"fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr x27, [x15, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v22.4s, v4.4s, v2.s[1]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x25, [x9, #0x8]\n"
"fmla v28.4s, v4.4s, v3.s[1]\n"
"ldr d4, [x15, #0xc0]\n"
"fmla v11.4s, v5.4s, v0.s[1]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr x10, [x15, #0x108]\n"
+ "ldr x23, [x15, #0x108]\n"
"fmla v23.4s, v5.4s, v2.s[1]\n"
- "ldr x20, [x21, #0x8]\n"
+ "ldr x24, [x28, #0x8]\n"
"fmla v29.4s, v5.4s, v3.s[1]\n"
"ldr d5, [x15, #0xd0]\n"
"fmla v12.4s, v6.4s, v0.s[1]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v18.4s, v6.4s, v1.s[1]\n"
- "ldr x9, [x15, #0x118]\n"
+ "ldr x22, [x15, #0x118]\n"
"fmla v24.4s, v6.4s, v2.s[1]\n"
"sub x12, x12, #0x4\n"
"fmla v30.4s, v6.4s, v3.s[1]\n"
"ldr d6, [x15, #0xe0]\n"
"fmla v13.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x28\n"
+ "mov v6.d[1], x21\n"
"fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr x28, [x15, #0x128]\n"
+ "ldr x21, [x15, #0x128]\n"
"fmla v25.4s, v7.4s, v2.s[1]\n"
"cmp x12, #0x8\n"
"fmla v31.4s, v7.4s, v3.s[1]\n"
"ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x27\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v4.4s, v0.s[2]\n"
"fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr x27, [x15, #0x138]\n"
+ "ldr x20, [x15, #0x138]\n"
"fmla v20.4s, v4.4s, v2.s[2]\n"
"prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v4.4s, v3.s[2]\n"
"ldr d4, [x15, #0x100]\n"
"fmla v9.4s, v5.4s, v0.s[2]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr x10, [x15, #0x148]\n"
+ "ldr x23, [x15, #0x148]\n"
"fmla v21.4s, v5.4s, v2.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v27.4s, v5.4s, v3.s[2]\n"
"ldr d5, [x15, #0x110]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v16.4s, v6.4s, v1.s[2]\n"
- "ldr x9, [x15, #0x158]\n"
+ "ldr x22, [x15, #0x158]\n"
"fmla v22.4s, v6.4s, v2.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v28.4s, v6.4s, v3.s[2]\n"
"ldr d6, [x15, #0x120]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x28\n"
+ "mov v6.d[1], x21\n"
"fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr x28, [x15, #0x168]\n"
+ "ldr x21, [x15, #0x168]\n"
"fmla v23.4s, v7.4s, v2.s[2]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v29.4s, v7.4s, v3.s[2]\n"
"ldr d7, [x15, #0x130]\n"
- "mov v7.d[1], x27\n"
+ "mov v7.d[1], x20\n"
"fmla v12.4s, v4.4s, v0.s[2]\n"
"fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr x27, [x15, #0x178]\n"
+ "ldr x20, [x15, #0x178]\n"
"fmla v24.4s, v4.4s, v2.s[2]\n"
"fmla v30.4s, v4.4s, v3.s[2]\n"
"ldr d4, [x15, #0x140]\n"
"fmla v13.4s, v5.4s, v0.s[2]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v19.4s, v5.4s, v1.s[2]\n"
"fmla v25.4s, v5.4s, v2.s[2]\n"
"fmla v31.4s, v5.4s, v3.s[2]\n"
"ldr d5, [x15, #0x150]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v20.4s, v6.4s, v2.s[3]\n"
"fmla v26.4s, v6.4s, v3.s[3]\n"
"ldr d6, [x15, #0x160]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x28\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
"fmla v21.4s, v7.4s, v2.s[3]\n"
"fmla v27.4s, v7.4s, v3.s[3]\n"
"ldr d7, [x15, #0x170]\n"
- "mov v7.d[1], x27\n"
+ "mov v7.d[1], x20\n"
"add x15, x15, #0x180\n"
"fmla v10.4s, v4.4s, v0.s[3]\n"
- "ldr x10, [x15, #0x8]\n"
+ "ldr x23, [x15, #0x8]\n"
"fmla v16.4s, v4.4s, v1.s[3]\n"
- "ldr x9, [x15, #0x18]\n"
+ "ldr x22, [x15, #0x18]\n"
"fmla v22.4s, v4.4s, v2.s[3]\n"
- "ldr x28, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
"fmla v28.4s, v4.4s, v3.s[3]\n"
"ldr d4, [x15, #0x0]\n"
"fmla v11.4s, v5.4s, v0.s[3]\n"
- "ldr x27, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
"fmla v17.4s, v5.4s, v1.s[3]\n"
- "mov v4.d[1], x10\n"
+ "mov v4.d[1], x23\n"
"fmla v23.4s, v5.4s, v2.s[3]\n"
"fmla v29.4s, v5.4s, v3.s[3]\n"
"ldr d5, [x15, #0x10]\n"
"fmla v12.4s, v6.4s, v0.s[3]\n"
- "mov v5.d[1], x9\n"
+ "mov v5.d[1], x22\n"
"fmla v18.4s, v6.4s, v1.s[3]\n"
"fmla v24.4s, v6.4s, v2.s[3]\n"
"fmla v30.4s, v6.4s, v3.s[3]\n"
@@ -2352,30 +2351,30 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v13.4s, v7.4s, v0.s[3]\n"
"ldr d0, [x11, #0x0]\n"
"fmla v19.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x25, #0x0]\n"
+ "ldr d1, [x10, #0x0]\n"
"fmla v25.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x23, #0x0]\n"
+ "ldr d2, [x9, #0x0]\n"
"fmla v31.4s, v7.4s, v3.s[3]\n"
- "ldr d3, [x21, #0x0]\n"
+ "ldr d3, [x28, #0x0]\n"
"ldr d7, [x15, #0x30]\n"
- "mov v6.d[1], x28\n"
- "mov v0.d[1], x26\n"
- "mov v1.d[1], x24\n"
- "mov v2.d[1], x22\n"
- "mov v3.d[1], x20\n"
- "mov v7.d[1], x27\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x24\n"
+ "mov v7.d[1], x20\n"
"bge 145b\n"
"146:" // Height 4: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
"add x11, x11, #0x10\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v26.4s, v4.4s, v3.s[0]\n"
"ldr q4, [x15, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
"sub x12, x12, #0x4\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
@@ -2383,11 +2382,11 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"fmla v27.4s, v5.4s, v3.s[0]\n"
"ldr q5, [x15, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v28.4s, v6.4s, v3.s[0]\n"
"ldr q6, [x15, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
@@ -2495,42 +2494,42 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"147:" // Height 4: Multiply loop: Main loop skip
"cbz x12, 149f\n"
"148:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
+ "ldr s7, [x11], #0x4\n"
"sub x12, x12, #0x1\n"
- "ldr s1, [x25], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x21], #0x4\n"
- "ldr q4, [x15, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x15, #0x10]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
- "fmla v26.4s, v4.4s, v3.s[0]\n"
- "ldr q4, [x15, #0x40]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "fmla v21.4s, v5.4s, v2.s[0]\n"
- "fmla v27.4s, v5.4s, v3.s[0]\n"
- "ldr q5, [x15, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s6, [x10], #0x4\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s4, [x28], #0x4\n"
+ "ldr q1, [x15, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x15, #0x20]\n"
+ "fmla v20.4s, v1.4s, v5.s[0]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v26.4s, v1.4s, v4.s[0]\n"
+ "ldr q1, [x15, #0x40]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v21.4s, v0.4s, v5.s[0]\n"
+ "fmla v27.4s, v0.4s, v4.s[0]\n"
+ "ldr q0, [x15, #0x50]\n"
+ "fmla v10.4s, v3.4s, v7.s[0]\n"
"add x15, x15, #0x60\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
- "fmla v22.4s, v6.4s, v2.s[0]\n"
- "fmla v28.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
- "fmla v23.4s, v7.4s, v2.s[0]\n"
- "fmla v29.4s, v7.4s, v3.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "fmla v30.4s, v4.4s, v3.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "fmla v16.4s, v3.4s, v6.s[0]\n"
+ "fmla v22.4s, v3.4s, v5.s[0]\n"
+ "fmla v28.4s, v3.4s, v4.s[0]\n"
+ "fmla v11.4s, v2.4s, v7.s[0]\n"
+ "fmla v17.4s, v2.4s, v6.s[0]\n"
+ "fmla v23.4s, v2.4s, v5.s[0]\n"
+ "fmla v29.4s, v2.4s, v4.s[0]\n"
+ "fmla v12.4s, v1.4s, v7.s[0]\n"
+ "fmla v18.4s, v1.4s, v6.s[0]\n"
+ "fmla v24.4s, v1.4s, v5.s[0]\n"
+ "fmla v30.4s, v1.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v7.s[0]\n"
+ "fmla v19.4s, v0.4s, v6.s[0]\n"
+ "fmla v25.4s, v0.4s, v5.s[0]\n"
+ "fmla v31.4s, v0.4s, v4.s[0]\n"
"cbnz x12, 148b\n"
"149:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2796,7 +2795,6 @@ void a64_hybrid_fp32_mla_4x24_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"166:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
index 5fb71c95b7..dbd45460e8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_4x24 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 124f\n"
@@ -223,11 +222,11 @@ void a64_hybrid_fp32_mla_4x24 (
"19:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -246,126 +245,126 @@ void a64_hybrid_fp32_mla_4x24 (
"blt 23f\n"
"22:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q19, [x28, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q18, [x28, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q17, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "ldr q16, [x28, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x28, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x28, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x28, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x28, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x28, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x28, #0x170]\n"
"sub x25, x25, #0x4\n"
"add x24, x24, #0x10\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
"cmp x25, #0x8\n"
"add x28, x28, #0x180\n"
"ldr q4, [x28, #0x0]\n"
"ldr q5, [x28, #0x10]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
"ldr q6, [x28, #0x20]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x24, #0x0]\n"
"ldr q7, [x28, #0x30]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"bge 22b\n"
"23:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q19, [x28, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q18, [x28, #0x50]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q17, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "ldr q16, [x28, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x28, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x28, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x28, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x28, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x28, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x28, #0x170]\n"
"add x24, x24, #0x10\n"
"sub x25, x25, #0x4\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"add x28, x28, #0x180\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
"24:" // Height 1: Multiply loop: Main loop skip
"cbz x25, 26f\n"
"25:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr q4, [x28, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr q16, [x28, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x25, x25, #0x1\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q4, [x28, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q17, [x28, #0x10]\n"
+ "ldr q16, [x28, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q17, [x28, #0x30]\n"
+ "ldr q16, [x28, #0x40]\n"
+ "fmla v11.4s, v17.4s, v18.s[0]\n"
+ "fmla v12.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x28, #0x50]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
"add x28, x28, #0x60\n"
"cbnz x25, 25b\n"
"26:" // Height 1: Multiply loop: No odd multiplies
@@ -376,21 +375,21 @@ void a64_hybrid_fp32_mla_4x24 (
"prfm pstl1keep, [x27, #0x0]\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
"27:" // Height 1: No activation
"cmp x9, #0x18\n"
"bge 40f\n"
@@ -651,12 +650,12 @@ void a64_hybrid_fp32_mla_4x24 (
"60:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -664,7 +663,7 @@ void a64_hybrid_fp32_mla_4x24 (
"b 62f\n"
"61:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"62:" // Height 2: input setup done
"cmp x25, #0x4\n"
"blt 65f\n"
@@ -679,186 +678,186 @@ void a64_hybrid_fp32_mla_4x24 (
"63:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v4.4s, v0.s[0]\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x40]\n"
"sub x25, x25, #0x4\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q22, [x28, #0x50]\n"
"add x24, x24, #0x10\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q21, [x28, #0x60]\n"
"add x23, x23, #0x10\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q20, [x28, #0x70]\n"
"cmp x25, #0x8\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x90]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x28, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x28, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x28, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x28, #0x170]\n"
"add x28, x28, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
"ldr q4, [x28, #0x0]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
"ldr q5, [x28, #0x10]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
"ldr q6, [x28, #0x20]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
"ldr q0, [x24, #0x0]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
"ldr q1, [x23, #0x0]\n"
"ldr q7, [x28, #0x30]\n"
"bge 63b\n"
"64:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v4.4s, v0.s[0]\n"
"fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x40]\n"
"add x24, x24, #0x10\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q22, [x28, #0x50]\n"
"add x23, x23, #0x10\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q21, [x28, #0x60]\n"
"sub x25, x25, #0x4\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q20, [x28, #0x70]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x28, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x28, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x28, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x28, #0x170]\n"
"add x28, x28, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
"65:" // Height 2: Multiply loop: Main loop skip
"cbz x25, 67f\n"
"66:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
"sub x25, x25, #0x1\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
- "ldr q5, [x28, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "fmla v8.4s, v21.4s, v25.s[0]\n"
+ "fmla v14.4s, v21.4s, v24.s[0]\n"
+ "ldr q23, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
+ "fmla v9.4s, v20.4s, v25.s[0]\n"
+ "fmla v15.4s, v20.4s, v24.s[0]\n"
+ "ldr q21, [x28, #0x40]\n"
+ "ldr q20, [x28, #0x50]\n"
+ "fmla v10.4s, v23.4s, v25.s[0]\n"
+ "fmla v16.4s, v23.4s, v24.s[0]\n"
+ "fmla v11.4s, v22.4s, v25.s[0]\n"
+ "fmla v17.4s, v22.4s, v24.s[0]\n"
"add x28, x28, #0x60\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v12.4s, v21.4s, v25.s[0]\n"
+ "fmla v18.4s, v21.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v25.s[0]\n"
+ "fmla v19.4s, v20.4s, v24.s[0]\n"
"cbnz x25, 66b\n"
"67:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -871,33 +870,33 @@ void a64_hybrid_fp32_mla_4x24 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 68f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"68:" // Height 2: No activation
"cmp x9, #0x18\n"
"bge 81f\n"
@@ -1237,13 +1236,13 @@ void a64_hybrid_fp32_mla_4x24 (
"101:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 102f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 103f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1252,8 +1251,8 @@ void a64_hybrid_fp32_mla_4x24 (
"b 103f\n"
"102:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"103:" // Height 3: input setup done
"cmp x25, #0x4\n"
"blt 106f\n"
@@ -1272,107 +1271,107 @@ void a64_hybrid_fp32_mla_4x24 (
"sub x25, x25, #0x4\n"
"add x24, x24, #0x10\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q29, [x28, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
"add x23, x23, #0x10\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q28, [x28, #0x50]\n"
"add x22, x22, #0x10\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
"cmp x25, #0x8\n"
"prfm pldl1keep, [x24, #0x80]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q27, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q26, [x28, #0x70]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v20.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v21.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "fmla v23.4s, v5.4s, v2.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "fmla v24.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "fmla v25.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "fmla v20.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "fmla v21.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "fmla v22.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "fmla v23.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "fmla v24.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "fmla v25.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v20.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v21.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x28, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x28, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x28, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x28, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x28, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x28, #0x170]\n"
"add x28, x28, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
"ldr q4, [x28, #0x0]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
"ldr q5, [x28, #0x10]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
"ldr q6, [x28, #0x20]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
"ldr q0, [x24, #0x0]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
"ldr q1, [x23, #0x0]\n"
- "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
"ldr q2, [x22, #0x0]\n"
"ldr q7, [x28, #0x30]\n"
"bge 104b\n"
@@ -1382,133 +1381,133 @@ void a64_hybrid_fp32_mla_4x24 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v20.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q29, [x28, #0x40]\n"
"fmla v9.4s, v5.4s, v0.s[0]\n"
"add x22, x22, #0x10\n"
"fmla v15.4s, v5.4s, v1.s[0]\n"
"fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q28, [x28, #0x50]\n"
"sub x25, x25, #0x4\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v16.4s, v6.4s, v1.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"fmla v22.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q27, [x28, #0x60]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"fmla v17.4s, v7.4s, v1.s[0]\n"
"fmla v23.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x28, #0x70]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "ldr q4, [x28, #0x80]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x28, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v20.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x28, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v21.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x28, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v1.s[1]\n"
- "fmla v22.4s, v4.4s, v2.s[1]\n"
- "ldr q4, [x28, #0xc0]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v17.4s, v5.4s, v1.s[1]\n"
- "fmla v23.4s, v5.4s, v2.s[1]\n"
- "ldr q5, [x28, #0xd0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v18.4s, v6.4s, v1.s[1]\n"
- "fmla v24.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x28, #0xe0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v19.4s, v7.4s, v1.s[1]\n"
- "fmla v25.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x28, #0xf0]\n"
- "fmla v8.4s, v4.4s, v0.s[2]\n"
- "fmla v14.4s, v4.4s, v1.s[2]\n"
- "fmla v20.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x28, #0x100]\n"
- "fmla v9.4s, v5.4s, v0.s[2]\n"
- "fmla v15.4s, v5.4s, v1.s[2]\n"
- "fmla v21.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x28, #0x110]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v16.4s, v6.4s, v1.s[2]\n"
- "fmla v22.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x28, #0x120]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v1.s[2]\n"
- "fmla v23.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x28, #0x130]\n"
- "fmla v12.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v1.s[2]\n"
- "fmla v24.4s, v4.4s, v2.s[2]\n"
- "ldr q4, [x28, #0x140]\n"
- "fmla v13.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v1.s[2]\n"
- "fmla v25.4s, v5.4s, v2.s[2]\n"
- "ldr q5, [x28, #0x150]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v20.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x28, #0x160]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v21.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x28, #0x170]\n"
+ "ldr q26, [x28, #0x70]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x28, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x28, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x28, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x28, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x28, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x28, #0x170]\n"
"add x28, x28, #0x180\n"
- "fmla v10.4s, v4.4s, v0.s[3]\n"
- "fmla v16.4s, v4.4s, v1.s[3]\n"
- "fmla v22.4s, v4.4s, v2.s[3]\n"
- "fmla v11.4s, v5.4s, v0.s[3]\n"
- "fmla v17.4s, v5.4s, v1.s[3]\n"
- "fmla v23.4s, v5.4s, v2.s[3]\n"
- "fmla v12.4s, v6.4s, v0.s[3]\n"
- "fmla v18.4s, v6.4s, v1.s[3]\n"
- "fmla v24.4s, v6.4s, v2.s[3]\n"
- "fmla v13.4s, v7.4s, v0.s[3]\n"
- "fmla v19.4s, v7.4s, v1.s[3]\n"
- "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
"106:" // Height 3: Multiply loop: Main loop skip
"cbz x25, 108f\n"
"107:" // Height 3: Multiply loop: Odd block loop
"ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
"sub x25, x25, #0x1\n"
- "ldr s2, [x22], #0x4\n"
- "ldr q4, [x28, #0x0]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q4, [x28, #0x40]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "fmla v21.4s, v5.4s, v2.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q27, [x28, #0x0]\n"
+ "fmla v8.4s, v27.4s, v0.s[0]\n"
+ "fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "fmla v20.4s, v27.4s, v30.s[0]\n"
+ "fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q28, [x28, #0x30]\n"
+ "ldr q27, [x28, #0x40]\n"
+ "fmla v15.4s, v26.4s, v31.s[0]\n"
+ "fmla v21.4s, v26.4s, v30.s[0]\n"
+ "ldr q26, [x28, #0x50]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v16.4s, v29.4s, v31.s[0]\n"
"add x28, x28, #0x60\n"
- "fmla v22.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
- "fmla v23.4s, v7.4s, v2.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v17.4s, v28.4s, v31.s[0]\n"
+ "fmla v23.4s, v28.4s, v30.s[0]\n"
+ "fmla v12.4s, v27.4s, v0.s[0]\n"
+ "fmla v18.4s, v27.4s, v31.s[0]\n"
+ "fmla v24.4s, v27.4s, v30.s[0]\n"
+ "fmla v13.4s, v26.4s, v0.s[0]\n"
+ "fmla v19.4s, v26.4s, v31.s[0]\n"
+ "fmla v25.4s, v26.4s, v30.s[0]\n"
"cbnz x25, 107b\n"
"108:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1523,45 +1522,45 @@ void a64_hybrid_fp32_mla_4x24 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 109f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v27.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
"109:" // Height 3: No activation
"cmp x9, #0x18\n"
"bge 122f\n"
@@ -1983,14 +1982,14 @@ void a64_hybrid_fp32_mla_4x24 (
"142:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 143f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 144f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -2000,9 +1999,9 @@ void a64_hybrid_fp32_mla_4x24 (
"b 144f\n"
"143:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"144:" // Height 4: input setup done
"cmp x25, #0x4\n"
"blt 147f\n"
@@ -2283,42 +2282,42 @@ void a64_hybrid_fp32_mla_4x24 (
"147:" // Height 4: Multiply loop: Main loop skip
"cbz x25, 149f\n"
"148:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x24], #0x4\n"
- "ldr s1, [x23], #0x4\n"
+ "ldr s7, [x24], #0x4\n"
+ "ldr s6, [x23], #0x4\n"
"sub x25, x25, #0x1\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x21], #0x4\n"
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "fmla v14.4s, v4.4s, v1.s[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- "fmla v20.4s, v4.4s, v2.s[0]\n"
- "fmla v26.4s, v4.4s, v3.s[0]\n"
- "ldr q4, [x28, #0x40]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v15.4s, v5.4s, v1.s[0]\n"
- "fmla v21.4s, v5.4s, v2.s[0]\n"
- "fmla v27.4s, v5.4s, v3.s[0]\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q0, [x28, #0x10]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q2, [x28, #0x30]\n"
+ "fmla v20.4s, v1.4s, v5.s[0]\n"
+ "fmla v26.4s, v1.4s, v4.s[0]\n"
+ "ldr q1, [x28, #0x40]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v21.4s, v0.4s, v5.s[0]\n"
+ "fmla v27.4s, v0.4s, v4.s[0]\n"
+ "ldr q0, [x28, #0x50]\n"
"add x28, x28, #0x60\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v16.4s, v6.4s, v1.s[0]\n"
- "fmla v22.4s, v6.4s, v2.s[0]\n"
- "fmla v28.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v17.4s, v7.4s, v1.s[0]\n"
- "fmla v23.4s, v7.4s, v2.s[0]\n"
- "fmla v29.4s, v7.4s, v3.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[0]\n"
- "fmla v18.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v2.s[0]\n"
- "fmla v30.4s, v4.4s, v3.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[0]\n"
- "fmla v19.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v2.s[0]\n"
- "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "fmla v10.4s, v3.4s, v7.s[0]\n"
+ "fmla v16.4s, v3.4s, v6.s[0]\n"
+ "fmla v22.4s, v3.4s, v5.s[0]\n"
+ "fmla v28.4s, v3.4s, v4.s[0]\n"
+ "fmla v11.4s, v2.4s, v7.s[0]\n"
+ "fmla v17.4s, v2.4s, v6.s[0]\n"
+ "fmla v23.4s, v2.4s, v5.s[0]\n"
+ "fmla v29.4s, v2.4s, v4.s[0]\n"
+ "fmla v12.4s, v1.4s, v7.s[0]\n"
+ "fmla v18.4s, v1.4s, v6.s[0]\n"
+ "fmla v24.4s, v1.4s, v5.s[0]\n"
+ "fmla v30.4s, v1.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v7.s[0]\n"
+ "fmla v19.4s, v0.4s, v6.s[0]\n"
+ "fmla v25.4s, v0.4s, v5.s[0]\n"
+ "fmla v31.4s, v0.4s, v4.s[0]\n"
"cbnz x25, 148b\n"
"149:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2584,7 +2583,6 @@ void a64_hybrid_fp32_mla_4x24 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"166:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 4cfa18bb84..759729de5e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -113,5 +113,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
index 985d57d9b6..ddbc840829 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16_a55 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 166f\n"
@@ -189,11 +188,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
"cbnz x15, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
@@ -210,126 +209,126 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr d17, [x17, #0x20]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x38]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x12, [x17, #0x48]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x58]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x78]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x98]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xb8]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xd8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xf8]\n"
- "mov v7.d[1], x11\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "mov v16.d[1], x20\n"
"add x13, x13, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
"sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x8\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v0.d[1], x10\n"
- "mov v7.d[1], x11\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x13, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
"add x13, x13, #0x10\n"
"sub x14, x14, #0x4\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s17, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
"add x17, x17, #0x40\n"
"cbnz x14, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
@@ -340,17 +339,17 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"23:" // Height 1: No activation
"cmp x8, #0x10\n"
"bge 32f\n"
@@ -528,196 +527,196 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
"cbnz x15, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
- "add x9, x9, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
"50:" // Height 2: input setup done
"cmp x14, #0x4\n"
"blt 53f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x8\n"
- "ldr q1, [x9, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x48]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x58]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x98]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0xd8]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v6.d[1], x12\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x21\n"
"add x13, x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
"sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x8\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x28, [x9, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v1.d[1], x28\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v1.d[1], x21\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
"sub x14, x14, #0x4\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x14, 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr q17, [x17, #0x0]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x17, #0x20]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
"cbnz x14, 54b\n"
"55:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -730,25 +729,25 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
+ "fmin v14.4s, v14.4s, v16.4s\n"
+ "fmin v15.4s, v15.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"56:" // Height 2: No activation
"cmp x8, #0x10\n"
"bge 65f\n"
@@ -975,244 +974,244 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
"cbnz x15, 83f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
- "add x9, x9, x20, LSL #2\n"
- "add x27, x27, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
"b 83f\n"
"82:" // Height 3: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #2\n"
- "add x27, x9, x20, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
"83:" // Height 3: input setup done
"cmp x14, #0x4\n"
"blt 86f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x8\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 85f\n"
"84:" // Height 3: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d21, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr d20, [x17, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr d21, [x17, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr d20, [x17, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr d21, [x17, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr d20, [x17, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr d21, [x17, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr d20, [x17, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr d21, [x17, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr d20, [x17, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr d21, [x17, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr d20, [x17, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr d21, [x17, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
"add x13, x13, #0x10\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr d20, [x17, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
"sub x14, x14, #0x4\n"
"ldr d7, [x17, #0x10]\n"
"cmp x14, #0x8\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x17, #0x18]\n"
- "mov v0.d[1], x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v1.d[1], x28\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v2.d[1], x26\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "mov v7.d[1], x11\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 84b\n"
"85:" // Height 3: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q21, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x17, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x17, #0x50]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x17, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x17, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x17, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x17, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x17, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x17, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x17, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x17, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x17, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x17, #0xf0]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"86:" // Height 3: Multiply loop: Main loop skip
"cbz x14, 88f\n"
"87:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s24, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s23, [x12], #0x4\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x17, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "ldr q20, [x17, #0x10]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x17, #0x20]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
"cbnz x14, 87b\n"
"88:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1227,33 +1226,33 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"89:" // Height 3: No activation
"cmp x8, #0x10\n"
"bge 98f\n"
@@ -1529,292 +1528,292 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
"cbnz x15, 116f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
- "add x9, x9, x20, LSL #2\n"
- "add x27, x27, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
"b 116f\n"
"115:" // Height 4: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #2\n"
- "add x27, x9, x20, LSL #2\n"
- "add x25, x27, x20, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
"116:" // Height 4: input setup done
"cmp x14, #0x4\n"
"blt 119f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x8\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 118f\n"
"117:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d25, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "add x27, x27, #0x10\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "add x25, x25, #0x10\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr x26, [x27, #0x8]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr x24, [x25, #0x8]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr d25, [x17, #0x40]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr d25, [x17, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr d25, [x17, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
"sub x14, x14, #0x4\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr d25, [x17, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
"cmp x14, #0x8\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr d25, [x17, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr d25, [x17, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0x18]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
+ "ldr d3, [x10, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
- "mov v7.d[1], x11\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 117b\n"
"118:" // Height 4: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q25, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x17, #0x40]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x17, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x17, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x17, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x17, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x17, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x17, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x17, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x17, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x17, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x17, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x17, #0xf0]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"119:" // Height 4: Multiply loop: Main loop skip
"cbz x14, 121f\n"
"120:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s28, [x12], #0x4\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x17, #0x0]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "ldr q24, [x17, #0x10]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x17, #0x20]\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
"cbnz x14, 120b\n"
"121:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1831,41 +1830,41 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v24.4s\n"
+ "fmin v9.4s, v9.4s, v24.4s\n"
+ "fmin v10.4s, v10.4s, v24.4s\n"
+ "fmin v11.4s, v11.4s, v24.4s\n"
+ "fmin v12.4s, v12.4s, v24.4s\n"
+ "fmin v13.4s, v13.4s, v24.4s\n"
+ "fmin v14.4s, v14.4s, v24.4s\n"
+ "fmin v15.4s, v15.4s, v24.4s\n"
+ "fmin v16.4s, v16.4s, v24.4s\n"
+ "fmin v17.4s, v17.4s, v24.4s\n"
+ "fmin v18.4s, v18.4s, v24.4s\n"
+ "fmin v19.4s, v19.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v24.4s\n"
+ "fmin v21.4s, v21.4s, v24.4s\n"
+ "fmin v22.4s, v22.4s, v24.4s\n"
+ "fmin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"122:" // Height 4: No activation
"cmp x8, #0x10\n"
"bge 131f\n"
@@ -2190,340 +2189,340 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
"cbnz x15, 149f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
"add x9, x9, x20, LSL #2\n"
- "add x27, x27, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
- "add x23, x23, x20, LSL #2\n"
"b 149f\n"
"148:" // Height 5: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #2\n"
- "add x27, x9, x20, LSL #2\n"
- "add x25, x27, x20, LSL #2\n"
- "add x23, x25, x20, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
"149:" // Height 5: input setup done
"cmp x14, #0x4\n"
"blt 152f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x8\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 151f\n"
"150:" // Height 5: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr d29, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "add x23, x23, #0x10\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr x10, [x13, #0x8]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x68]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr x28, [x9, #0x8]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr x26, [x27, #0x8]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr x24, [x25, #0x8]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr x22, [x23, #0x8]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0x88]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr d29, [x17, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr d29, [x17, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
"sub x14, x14, #0x4\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
"cmp x14, #0x8\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr d29, [x17, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0xa8]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xc8]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xe8]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x12\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr d29, [x17, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr d29, [x17, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr d29, [x17, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "ldr x12, [x17, #0x8]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0x18]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
+ "ldr d4, [x9, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"bge 150b\n"
"151:" // Height 5: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q29, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x17, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x17, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x17, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x17, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x17, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x17, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x17, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x17, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x17, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x17, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x17, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x17, #0xf0]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"152:" // Height 5: Multiply loop: Main loop skip
"cbz x14, 154f\n"
"153:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s2, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x17, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "ldr q28, [x17, #0x10]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x17, #0x20]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
"cbnz x14, 153b\n"
"154:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2542,49 +2541,49 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v28.4s\n"
+ "fmin v9.4s, v9.4s, v28.4s\n"
+ "fmin v10.4s, v10.4s, v28.4s\n"
+ "fmin v11.4s, v11.4s, v28.4s\n"
+ "fmin v12.4s, v12.4s, v28.4s\n"
+ "fmin v13.4s, v13.4s, v28.4s\n"
+ "fmin v14.4s, v14.4s, v28.4s\n"
+ "fmin v15.4s, v15.4s, v28.4s\n"
+ "fmin v16.4s, v16.4s, v28.4s\n"
+ "fmin v17.4s, v17.4s, v28.4s\n"
+ "fmin v18.4s, v18.4s, v28.4s\n"
+ "fmin v19.4s, v19.4s, v28.4s\n"
+ "fmin v20.4s, v20.4s, v28.4s\n"
+ "fmin v21.4s, v21.4s, v28.4s\n"
+ "fmin v22.4s, v22.4s, v28.4s\n"
+ "fmin v23.4s, v23.4s, v28.4s\n"
+ "fmin v24.4s, v24.4s, v28.4s\n"
+ "fmin v25.4s, v25.4s, v28.4s\n"
+ "fmin v26.4s, v26.4s, v28.4s\n"
+ "fmin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v0.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"155:" // Height 5: No activation
"cmp x8, #0x10\n"
"bge 164f\n"
@@ -2961,98 +2960,98 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
"cbnz x15, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
"add x9, x9, x20, LSL #2\n"
- "add x27, x27, x20, LSL #2\n"
- "add x25, x25, x20, LSL #2\n"
- "add x23, x23, x20, LSL #2\n"
- "add x21, x21, x20, LSL #2\n"
+ "add x28, x28, x20, LSL #2\n"
"b 182f\n"
"181:" // Height 6: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20, LSL #2\n"
- "add x27, x9, x20, LSL #2\n"
- "add x25, x27, x20, LSL #2\n"
- "add x23, x25, x20, LSL #2\n"
- "add x21, x23, x20, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
+ "add x28, x9, x21, LSL #2\n"
"182:" // Height 6: input setup done
"cmp x14, #0x4\n"
"blt 185f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x8\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
"ldr q7, [x17, #0x10]\n"
"blt 184f\n"
"183:" // Height 6: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr x12, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
"ldr d6, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x48]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x30]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr x10, [x13, #0x8]\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x12, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr x20, [x21, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x50]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"sub x14, x14, #0x4\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
@@ -3062,96 +3061,96 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v28.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0x88]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x70]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
"fmla v30.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x12, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
"fmla v31.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x90]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
"fmla v28.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xc8]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
"fmla v29.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xb0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
"fmla v30.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr x12, [x17, #0xe8]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
"fmla v31.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xd0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
"fmla v28.4s, v6.4s, v5.s[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
"fmla v29.4s, v7.4s, v5.s[3]\n"
"ldr d7, [x17, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "ldr x12, [x17, #0x8]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
"fmla v26.4s, v6.4s, v4.s[3]\n"
@@ -3160,56 +3159,56 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v11.4s, v7.4s, v0.s[3]\n"
"ldr d0, [x13, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x11, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d3, [x10, #0x0]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ "ldr d4, [x9, #0x0]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
- "ldr d5, [x21, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
"ldr d7, [x17, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
"mov v3.d[1], x24\n"
- "mov v4.d[1], x22\n"
- "mov v5.d[1], x20\n"
- "mov v7.d[1], x11\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 183b\n"
"184:" // Height 6: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
"ldr q6, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr q7, [x17, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
@@ -3307,42 +3306,42 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"185:" // Height 6: Multiply loop: Main loop skip
"cbz x14, 187f\n"
"186:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
"sub x14, x14, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s6, [x12], #0x4\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x17, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x17, #0x30]\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v30.4s, v6.4s, v5.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
"cbnz x14, 186b\n"
"187:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3584,7 +3583,6 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index c5e4388aa9..bb84a50282 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 166f\n"
@@ -189,11 +188,11 @@ void a64_hybrid_fp32_mla_6x16 (
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -210,37 +209,37 @@ void a64_hybrid_fp32_mla_6x16 (
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"sub x27, x27, #0x4\n"
"add x26, x26, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x8\n"
"add x10, x10, #0x100\n"
@@ -250,52 +249,52 @@ void a64_hybrid_fp32_mla_6x16 (
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x4\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
"cbz x27, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
"sub x27, x27, #0x1\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v11.4s, v16.4s, v18.s[0]\n"
"add x10, x10, #0x40\n"
"cbnz x27, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
@@ -306,17 +305,17 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"23:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 32f\n"
@@ -494,12 +493,12 @@ void a64_hybrid_fp32_mla_6x16 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -507,7 +506,7 @@ void a64_hybrid_fp32_mla_6x16 (
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"50:" // Height 2: input setup done
"cmp x27, #0x4\n"
"blt 53f\n"
@@ -520,134 +519,134 @@ void a64_hybrid_fp32_mla_6x16 (
"51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"sub x27, x27, #0x4\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"add x25, x25, #0x10\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"cmp x27, #0x8\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"add x26, x26, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"sub x27, x27, #0x4\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
"add x10, x10, #0x40\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
"cbnz x27, 54b\n"
"55:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -660,25 +659,25 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"56:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 65f\n"
@@ -905,13 +904,13 @@ void a64_hybrid_fp32_mla_6x16 (
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 83f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -920,8 +919,8 @@ void a64_hybrid_fp32_mla_6x16 (
"b 83f\n"
"82:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"83:" // Height 3: input setup done
"cmp x27, #0x4\n"
"blt 86f\n"
@@ -938,75 +937,75 @@ void a64_hybrid_fp32_mla_6x16 (
"sub x27, x27, #0x4\n"
"add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x25, x25, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
"cmp x27, #0x8\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x10, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 84b\n"
@@ -1016,95 +1015,95 @@ void a64_hybrid_fp32_mla_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x24, x24, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x4\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"86:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 88f\n"
"87:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
"cbnz x27, 87b\n"
"88:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1119,33 +1118,33 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v21.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"89:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 98f\n"
@@ -1421,14 +1420,14 @@ void a64_hybrid_fp32_mla_6x16 (
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 116f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1438,9 +1437,9 @@ void a64_hybrid_fp32_mla_6x16 (
"b 116f\n"
"115:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"116:" // Height 4: input setup done
"cmp x27, #0x4\n"
"blt 119f\n"
@@ -1459,7 +1458,7 @@ void a64_hybrid_fp32_mla_6x16 (
"add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x25, x25, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1467,85 +1466,85 @@ void a64_hybrid_fp32_mla_6x16 (
"add x23, x23, #0x10\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"cmp x27, #0x8\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 117b\n"
@@ -1556,7 +1555,7 @@ void a64_hybrid_fp32_mla_6x16 (
"add x25, x25, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x24, x24, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1564,109 +1563,109 @@ void a64_hybrid_fp32_mla_6x16 (
"sub x27, x27, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"119:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 121f\n"
"120:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
"cbnz x27, 120b\n"
"121:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1683,41 +1682,41 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v25.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"122:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 131f\n"
@@ -2042,15 +2041,15 @@ void a64_hybrid_fp32_mla_6x16 (
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 149f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2061,10 +2060,10 @@ void a64_hybrid_fp32_mla_6x16 (
"b 149f\n"
"148:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"149:" // Height 5: input setup done
"cmp x27, #0x4\n"
"blt 152f\n"
@@ -2087,7 +2086,7 @@ void a64_hybrid_fp32_mla_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x23, x23, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2096,100 +2095,100 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp x27, #0x8\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
"ldr q0, [x26, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
"ldr q1, [x25, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
"ldr q2, [x24, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
"ldr q3, [x23, #0x0]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 150b\n"
@@ -2203,7 +2202,7 @@ void a64_hybrid_fp32_mla_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"add x22, x22, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2212,128 +2211,128 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"152:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 154f\n"
"153:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
"cbnz x27, 153b\n"
"154:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2352,49 +2351,49 @@ void a64_hybrid_fp32_mla_6x16 (
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v29.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v1.4s\n"
- "fmin v25.4s, v25.4s, v1.4s\n"
- "fmin v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v0.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"155:" // Height 5: No activation
"cmp x11, #0x10\n"
"bge 164f\n"
@@ -2771,16 +2770,16 @@ void a64_hybrid_fp32_mla_6x16 (
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 182f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2792,11 +2791,11 @@ void a64_hybrid_fp32_mla_6x16 (
"b 182f\n"
"181:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"182:" // Height 6: input setup done
"cmp x27, #0x4\n"
"blt 185f\n"
@@ -3073,42 +3072,42 @@ void a64_hybrid_fp32_mla_6x16 (
"185:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 187f\n"
"186:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x1\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v30.4s, v6.4s, v5.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
"cbnz x27, 186b\n"
"187:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3350,7 +3349,6 @@ void a64_hybrid_fp32_mla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index 4fad58a83d..3ec02395d1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#define ARGLIST \
@@ -90,5 +90,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
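(The "Partial direct writeback" blocks in these kernels store an n < 4 column tail by testing bits of the remaining width: bit 1 selects a two-float "str d" store that advances the pointer, bit 0 a single-lane store. A hedged C++ sketch of that bit-test pattern, assuming a plain float destination; the function and parameter names are illustrative, not the library's API.)

    #include <cstring>

    // Store the low n (0 < n < 4) lanes of a 4-float accumulator, mirroring
    // the "tbz x4, #1" / "tbz x4, #0" writeback above: bit 1 of n stores a
    // float pair and advances, bit 0 then stores one more lane.
    inline void store_partial(float *dst, const float v[4], unsigned n)
    {
        unsigned i = 0;
        if (n & 2u) { std::memcpy(dst, v, 2 * sizeof(float)); i = 2; }
        if (n & 1u) { dst[i] = v[i]; }
    }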
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
index 67e0c1e8cc..236865315e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, 2023 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4_a55 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x8\n"
"bge 148f\n"
@@ -105,563 +104,563 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"cmp %x[M], #0x2\n"
"bgt 43f\n"
"beq 22f\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[bias]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x15, 3f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 3f\n"
+ "ldr q24, [x3, #0x0]\n"
+ "add x3, x3, #0x10\n"
"b 8f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 7f\n"
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 6f\n"
- "tbz x17, #1, 4f\n"
- "ldr d24, [x14], #0x8\n"
- "mov x8, #0x8\n"
- "tbz x17, #0, 5f\n"
- "ld1 { v24.s }[2], [x14]\n"
+ "tbz x4, #1, 4f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "tbz x4, #0, 5f\n"
+ "ld1 { v24.s }[2], [x6]\n"
"b 5f\n"
"4:" // Height 1: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
"5:" // Height 1: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 8f\n"
"6:" // Height 1: full accumulate
- "ldr q24, [x14, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
"b 8f\n"
"7:" // Height 1: no accumulate
"movi v24.16b, #0x0\n"
"8:" // Height 1: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"9:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "cbnz x13, 11f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "cbnz x7, 11f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
"b 11f\n"
"10:" // Height 1: setup direct input
- "mov x11, %x[input_ptr]\n"
+ "mov x17, %x[input_ptr]\n"
"11:" // Height 1: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 14f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
- "cmp x12, #0x8\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 13f\n"
"12:" // Height 1: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
- "ldr x8, [x16, #0x18]\n"
- "add x11, x11, #0x10\n"
- "ldr d10, [x16, #0x20]\n"
- "sub x12, x12, #0x4\n"
- "ldr x21, [x16, #0x28]\n"
- "cmp x12, #0x8\n"
- "mov v9.d[1], x8\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x8, [x16, #0x38]\n"
- "add x16, x16, #0x40\n"
+ "add x17, x17, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v11.d[1], x8\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x10, [x11, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
- "mov v0.d[1], x10\n"
+ "ldr d0, [x17, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d10, [x5, #0x20]\n"
+ "cmp x8, #0x8\n"
+ "ldr d11, [x5, #0x30]\n"
+ "ldr x26, [x5, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x28]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"bge 12b\n"
"13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "sub x12, x12, #0x4\n"
- "ldr q11, [x16, #0x30]\n"
- "add x11, x11, #0x10\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "add x16, x16, #0x40\n"
+ "add x17, x17, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
+ "add x5, x5, #0x40\n"
"14:" // Height 1: Multiply loop: Main loop skip
- "cbz x12, 16f\n"
+ "cbz x8, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "cbnz x12, 15b\n"
+ "ldr s17, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "cbnz x8, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 9b\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
"tbz %x[flags], #1, 17f\n"
- "add x8, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x8]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
"17:" // Height 1: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 20f\n"
- "tbz x17, #1, 18f\n"
- "str d24, [x14], #0x8\n"
- "tbz x17, #0, 19f\n"
- "st1 { v24.s }[2], [x14]\n"
+ "tbz x4, #1, 18f\n"
+ "str d24, [x6], #0x8\n"
+ "tbz x4, #0, 19f\n"
+ "st1 { v24.s }[2], [x6]\n"
"b 19f\n"
"18:" // Height 1: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
"19:" // Height 1: Partial direct writeback: Done
"b 21f\n"
"20:" // Height 1: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
"21:" // Height 1: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 2b\n"
"b 170f\n"
"22:" // Height 2
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"23:" // Height 2: Column loop
- "cbz x15, 24f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 24f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 29f\n"
"24:" // Height 2: no bias
"tbz %x[flags], #0, 28f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x4, #0x4\n"
+ "add x13, x6, x26, LSL #2\n"
"bge 27f\n"
- "tbz x17, #1, 25f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "tbz x17, #0, 26f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
+ "tbz x4, #1, 25f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "tbz x4, #0, 26f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
"b 26f\n"
"25:" // Height 2: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
"26:" // Height 2: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 29f\n"
"27:" // Height 2: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
"b 29f\n"
"28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"29:" // Height 2: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"30:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "cbnz x13, 32f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "cbnz x7, 32f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
"b 32f\n"
"31:" // Height 2: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
"32:" // Height 2: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 35f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 34f\n"
"33:" // Height 2: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
- "ldr d10, [x16, #0x20]\n"
- "add x11, x11, #0x10\n"
- "ldr x21, [x16, #0x28]\n"
- "add x9, x9, #0x10\n"
- "mov v9.d[1], x8\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x8, [x16, #0x38]\n"
- "sub x12, x12, #0x4\n"
+ "add x16, x16, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v11.d[1], x8\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "ldr x10, [x11, #0x8]\n"
- "cmp x12, #0x8\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "mov v8.d[1], x26\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
+ "ldr d1, [x16, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x17, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x27, [x5, #0x28]\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x27\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"bge 33b\n"
"34:" // Height 2: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
- "sub x12, x12, #0x4\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x16, x16, #0x40\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"35:" // Height 2: Multiply loop: Main loop skip
- "cbz x12, 37f\n"
+ "cbz x8, 37f\n"
"36:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "cbnz x12, 36b\n"
+ "ldr s18, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s17, [x16], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v18.s[0]\n"
+ "fmla v25.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "cbnz x8, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 30b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbz %x[flags], #1, 38f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
"38:" // Height 2: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 41f\n"
- "tbz x17, #1, 39f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "tbz x17, #0, 40f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
+ "tbz x4, #1, 39f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "tbz x4, #0, 40f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
"b 40f\n"
"39:" // Height 2: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
"40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
"41:" // Height 2: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
"42:" // Height 2: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 23b\n"
"b 170f\n"
"43:" // Height 3
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"44:" // Height 3: Column loop
- "cbz x15, 45f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 45f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 50f\n"
"45:" // Height 3: no bias
"tbz %x[flags], #0, 49f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x12, x13, x26, LSL #2\n"
"bge 48f\n"
- "tbz x17, #1, 46f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "tbz x17, #0, 47f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
+ "tbz x4, #1, 46f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
"b 47f\n"
"46:" // Height 3: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
"47:" // Height 3: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 50f\n"
"48:" // Height 3: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
"b 50f\n"
"49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"50:" // Height 3: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"51:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "cbnz x13, 53f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "cbnz x7, 53f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
"b 53f\n"
"52:" // Height 3: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
"53:" // Height 3: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 56f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 55f\n"
"54:" // Height 3: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
- "ldr x21, [x16, #0x28]\n"
- "add x11, x11, #0x10\n"
- "mov v9.d[1], x8\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x8, [x16, #0x38]\n"
- "add x9, x9, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x8\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x27, x27, #0x10\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr x28, [x5, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x27, [x5, #0x18]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
- "ldr x26, [x27, #0x8]\n"
- "sub x12, x12, #0x4\n"
+ "ldr d10, [x5, #0x20]\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
- "mov v0.d[1], x10\n"
- "cmp x12, #0x8\n"
+ "ldr d2, [x15, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x9, [x17, #0x8]\n"
+ "mov v8.d[1], x28\n"
+ "ldr x28, [x16, #0x8]\n"
+ "mov v9.d[1], x27\n"
+ "ldr x27, [x15, #0x8]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v0.d[1], x9\n"
"mov v1.d[1], x28\n"
- "add x16, x16, #0x40\n"
- "mov v2.d[1], x26\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v2.d[1], x27\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"bge 54b\n"
"55:" // Height 3: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
- "sub x12, x12, #0x4\n"
- "add x11, x11, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x27, x27, #0x10\n"
- "add x16, x16, #0x40\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"56:" // Height 3: Multiply loop: Main loop skip
- "cbz x12, 58f\n"
+ "cbz x8, 58f\n"
"57:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "cbnz x12, 57b\n"
+ "ldr s19, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s18, [x16], #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v19.s[0]\n"
+ "fmla v25.4s, v16.4s, v18.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 51b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
"tbz %x[flags], #1, 59f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
"59:" // Height 3: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 62f\n"
- "tbz x17, #1, 60f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "tbz x17, #0, 61f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
+ "tbz x4, #1, 60f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "tbz x4, #0, 61f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
"b 61f\n"
"60:" // Height 3: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
"61:" // Height 3: Partial direct writeback: Done
"b 63f\n"
"62:" // Height 3: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
"63:" // Height 3: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 44b\n"
"b 170f\n"
"64:" // Height 4
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"65:" // Height 4: Column loop
- "cbz x15, 66f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 66f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"b 71f\n"
"66:" // Height 4: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
- "add x25, x26, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x11, x12, x26, LSL #2\n"
"bge 69f\n"
- "tbz x17, #1, 67f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "tbz x17, #0, 68f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
+ "tbz x4, #1, 67f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "tbz x4, #0, 68f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
"b 68f\n"
"67:" // Height 4: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
"68:" // Height 4: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 71f\n"
"69:" // Height 4: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
"b 71f\n"
"70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
@@ -669,248 +668,248 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"71:" // Height 4: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"72:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "cbnz x13, 74f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
- "add x25, x25, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "cbnz x7, 74f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
"b 74f\n"
"73:" // Height 4: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
- "add x25, x27, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
"74:" // Height 4: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 77f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 76f\n"
"75:" // Height 4: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
- "mov v9.d[1], x8\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x8, [x16, #0x38]\n"
- "add x11, x11, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x8\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x5, #0x28]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x17, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
+ "ldr x9, [x16, #0x8]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d3, [x14, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x28, [x15, #0x8]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x14, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x11\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"mov v0.d[1], x10\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v1.d[1], x28\n"
- "add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "mov v2.d[1], x26\n"
- "ldr d3, [x25, #0x0]\n"
- "ldr x8, [x25, #0x8]\n"
- "cmp x12, #0x8\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "mov v3.d[1], x8\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v1.d[1], x9\n"
+ "mov v2.d[1], x28\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v3.d[1], x27\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v11.d[1], x26\n"
"bge 75b\n"
"76:" // Height 4: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x11, x11, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "add x16, x16, #0x40\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"77:" // Height 4: Multiply loop: Main loop skip
- "cbz x12, 79f\n"
+ "cbz x8, 79f\n"
"78:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "cbnz x12, 78b\n"
+ "ldr s20, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s19, [x16], #0x4\n"
+ "ldr s18, [x15], #0x4\n"
+ "ldr s17, [x14], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v20.s[0]\n"
+ "fmla v25.4s, v16.4s, v19.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v18.s[0]\n"
+ "fmla v27.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 78b\n"
"79:" // Height 4: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 72b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x8, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
"80:" // Height 4: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 83f\n"
- "tbz x17, #1, 81f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "tbz x17, #0, 82f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
+ "tbz x4, #1, 81f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "tbz x4, #0, 82f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
"b 82f\n"
"81:" // Height 4: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
"82:" // Height 4: Partial direct writeback: Done
"b 84f\n"
"83:" // Height 4: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
"84:" // Height 4: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 65b\n"
"b 170f\n"
"85:" // Height 5
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"86:" // Height 5: Column loop
- "cbz x15, 87f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 87f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"b 92f\n"
"87:" // Height 5: no bias
"tbz %x[flags], #0, 91f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
- "add x25, x26, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x10, x11, x26, LSL #2\n"
"bge 90f\n"
- "tbz x17, #1, 88f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "tbz x17, #0, 89f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
+ "tbz x4, #1, 88f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "tbz x4, #0, 89f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
"b 89f\n"
"88:" // Height 5: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
"89:" // Height 5: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 92f\n"
"90:" // Height 5: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
"b 92f\n"
"91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
@@ -919,283 +918,283 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v27.16b, #0x0\n"
"movi v28.16b, #0x0\n"
"92:" // Height 5: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"93:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "cbnz x13, 95f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
- "add x25, x25, x8, LSL #2\n"
- "add x24, x24, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "cbnz x7, 95f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
"b 95f\n"
"94:" // Height 5: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
- "add x25, x27, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
"95:" // Height 5: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 98f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 97f\n"
"96:" // Height 5: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x8\n"
- "ldr d11, [x16, #0x30]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "ldr x8, [x16, #0x38]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "mov v11.d[1], x8\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x12, [x5, #0x28]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x17, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x16, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "ldr x9, [x15, #0x8]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
+ "ldr x28, [x14, #0x8]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
- "ldr x26, [x27, #0x8]\n"
- "add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "mov v2.d[1], x26\n"
- "ldr d3, [x25, #0x0]\n"
- "ldr x8, [x25, #0x8]\n"
- "cmp x12, #0x8\n"
- "ldr d4, [x24, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr x21, [x24, #0x8]\n"
- "mov v3.d[1], x8\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v4.d[1], x21\n"
- "mov v8.d[1], x26\n"
+ "ldr d4, [x13, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x13, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v10.d[1], x12\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v0.d[1], x11\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v1.d[1], x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v2.d[1], x9\n"
+ "mov v3.d[1], x28\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v4.d[1], x27\n"
+ "mov v11.d[1], x26\n"
"bge 96b\n"
"97:" // Height 5: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x24, x24, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
"98:" // Height 5: Multiply loop: Main loop skip
- "cbz x12, 100f\n"
+ "cbz x8, 100f\n"
"99:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "cbnz x12, 99b\n"
+ "ldr s21, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s20, [x16], #0x4\n"
+ "ldr s19, [x15], #0x4\n"
+ "ldr s18, [x14], #0x4\n"
+ "ldr s17, [x13], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v21.s[0]\n"
+ "fmla v25.4s, v16.4s, v20.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v19.s[0]\n"
+ "fmla v27.4s, v16.4s, v18.s[0]\n"
+ "fmla v28.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 93b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x8, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x8, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
"tbz %x[flags], #1, 101f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
"101:" // Height 5: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 104f\n"
- "tbz x17, #1, 102f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "tbz x17, #0, 103f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
+ "tbz x4, #1, 102f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "tbz x4, #0, 103f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
"b 103f\n"
"102:" // Height 5: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
"103:" // Height 5: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 5: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
"105:" // Height 5: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 86b\n"
"b 170f\n"
"106:" // Height 6
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"107:" // Height 6: Column loop
- "cbz x15, 108f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 108f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"b 113f\n"
"108:" // Height 6: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
- "add x25, x26, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x9, x10, x26, LSL #2\n"
"bge 111f\n"
- "tbz x17, #1, 109f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "tbz x17, #0, 110f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
+ "tbz x4, #1, 109f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "tbz x4, #0, 110f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
"b 110f\n"
"109:" // Height 6: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
"110:" // Height 6: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 113f\n"
"111:" // Height 6: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
"b 113f\n"
"112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
@@ -1205,154 +1204,154 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v28.16b, #0x0\n"
"movi v29.16b, #0x0\n"
"113:" // Height 6: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"114:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "cbnz x13, 116f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
- "add x25, x25, x8, LSL #2\n"
- "add x24, x24, x8, LSL #2\n"
- "add x23, x23, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "cbnz x7, 116f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
"b 116f\n"
"115:" // Height 6: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
- "add x25, x27, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
"116:" // Height 6: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 119f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 118f\n"
"117:" // Height 6: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x8\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "ldr x8, [x16, #0x38]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x11, x11, #0x10\n"
+ "ldr x9, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x28, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "mov v11.d[1], x8\n"
+ "ldr x27, [x5, #0x28]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x26, [x17, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x16, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x15, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x12, #0x8\n"
- "ldr d3, [x25, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr x8, [x25, #0x8]\n"
- "ldr d4, [x24, #0x0]\n"
- "ldr x21, [x24, #0x8]\n"
- "mov v3.d[1], x8\n"
- "ldr d5, [x23, #0x0]\n"
- "ldr x8, [x23, #0x8]\n"
- "mov v4.d[1], x21\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v5.d[1], x8\n"
- "mov v8.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v8.d[1], x9\n"
+ "ldr x9, [x14, #0x8]\n"
+ "mov v9.d[1], x28\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v10.d[1], x27\n"
+ "ldr x27, [x12, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v1.d[1], x11\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v2.d[1], x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v3.d[1], x9\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v4.d[1], x28\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v5.d[1], x27\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 117b\n"
"118:" // Height 6: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x9, x9, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x23, x23, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -1361,108 +1360,108 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v28.4s, v11.4s, v4.s[3]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
"119:" // Height 6: Multiply loop: Main loop skip
- "cbz x12, 121f\n"
+ "cbz x8, 121f\n"
"120:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "cbnz x12, 120b\n"
+ "ldr s22, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s21, [x16], #0x4\n"
+ "ldr s20, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s18, [x13], #0x4\n"
+ "ldr s17, [x12], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v22.s[0]\n"
+ "fmla v25.4s, v16.4s, v21.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v20.s[0]\n"
+ "fmla v27.4s, v16.4s, v19.s[0]\n"
+ "fmla v28.4s, v16.4s, v18.s[0]\n"
+ "fmla v29.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 120b\n"
"121:" // Height 6: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 114b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x8, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x8, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x8, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v16.4s\n"
"fmin v29.4s, v29.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
"122:" // Height 6: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 125f\n"
- "tbz x17, #1, 123f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "tbz x17, #0, 124f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
+ "tbz x4, #1, 123f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "tbz x4, #0, 124f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
"b 124f\n"
"123:" // Height 6: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
"124:" // Height 6: Partial direct writeback: Done
"b 126f\n"
"125:" // Height 6: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
"126:" // Height 6: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 107b\n"
"b 170f\n"
"127:" // Height 7
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"128:" // Height 7: Column loop
- "cbz x15, 129f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 129f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1470,53 +1469,53 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 134f\n"
"129:" // Height 7: no bias
"tbz %x[flags], #0, 133f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
- "add x25, x26, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
- "add x22, x23, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x28, x9, x26, LSL #2\n"
"bge 132f\n"
- "tbz x17, #1, 130f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
- "tbz x17, #0, 131f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "tbz x4, #1, 130f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "tbz x4, #0, 131f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
"b 131f\n"
"130:" // Height 7: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
"131:" // Height 7: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 134f\n"
"132:" // Height 7: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
- "ldr q30, [x22, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
"b 134f\n"
"133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
@@ -1527,171 +1526,171 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"134:" // Height 7: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"135:" // Height 7: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "cbnz x13, 137f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
- "add x25, x25, x8, LSL #2\n"
- "add x24, x24, x8, LSL #2\n"
- "add x23, x23, x8, LSL #2\n"
- "add x22, x22, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
+ "cbnz x7, 137f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
"b 137f\n"
"136:" // Height 7: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
- "add x25, x27, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
- "add x22, x23, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
"137:" // Height 7: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 140f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 139f\n"
"138:" // Height 7: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x8\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "mov v10.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr x8, [x16, #0x38]\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x11, x11, #0x10\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "mov v11.d[1], x8\n"
+ "ldr x10, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x9, [x5, #0x28]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x9, x9, #0x10\n"
+ "ldr x28, [x17, #0x8]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x27, [x16, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "add x27, x27, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x8, #0x8\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "mov v8.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x25, x25, #0x10\n"
+ "ldr x26, [x15, #0x8]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
- "ldr x8, [x25, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
- "mov v3.d[1], x8\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "ldr d4, [x24, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr x21, [x24, #0x8]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v4.d[1], x21\n"
- "ldr d5, [x23, #0x0]\n"
- "ldr x8, [x23, #0x8]\n"
- "mov v8.d[1], x26\n"
- "ldr d6, [x22, #0x0]\n"
- "ldr x21, [x22, #0x8]\n"
- "mov v5.d[1], x8\n"
- "mov v6.d[1], x21\n"
+ "ldr d6, [x11, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v9.d[1], x10\n"
+ "ldr x10, [x14, #0x8]\n"
+ "mov v10.d[1], x9\n"
+ "ldr x9, [x13, #0x8]\n"
+ "mov v0.d[1], x28\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v1.d[1], x27\n"
+ "ldr x27, [x11, #0x8]\n"
+ "mov v2.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v3.d[1], x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v4.d[1], x9\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v5.d[1], x28\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v6.d[1], x27\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 138b\n"
"139:" // Height 7: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x22, x22, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1702,50 +1701,48 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v29.4s, v11.4s, v5.s[3]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
"140:" // Height 7: Multiply loop: Main loop skip
- "cbz x12, 142f\n"
+ "cbz x8, 142f\n"
"141:" // Height 7: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "cbnz x12, 141b\n"
+ "ldr s23, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s22, [x16], #0x4\n"
+ "ldr s21, [x15], #0x4\n"
+ "ldr s20, [x14], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v23.s[0]\n"
+ "fmla v25.4s, v16.4s, v22.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v21.s[0]\n"
+ "fmla v27.4s, v16.4s, v20.s[0]\n"
+ "fmla v28.4s, v16.4s, v19.s[0]\n"
+ "fmla v29.4s, v16.4s, v18.s[0]\n"
+ "fmla v30.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 141b\n"
"142:" // Height 7: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 135b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x8, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x8, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x8, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x8, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
"tbz %x[flags], #1, 143f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
@@ -1753,70 +1750,72 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmin v28.4s, v28.4s, v16.4s\n"
"fmin v29.4s, v29.4s, v16.4s\n"
"fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
"143:" // Height 7: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 146f\n"
- "tbz x17, #1, 144f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
- "tbz x17, #0, 145f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "tbz x4, #1, 144f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
+ "tbz x4, #0, 145f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
"b 145f\n"
"144:" // Height 7: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
"145:" // Height 7: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 7: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
- "str q30, [x22, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
"147:" // Height 7: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 128b\n"
"b 170f\n"
"148:" // Height 8
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x8, #0x20\n"
- "madd %x[output_ptr], x20, x8, %x[output_ptr]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, #0x20\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
+ "madd %x[output_ptr], x27, x26, %x[output_ptr]\n"
"149:" // Height 8: Column loop
- "cbz x15, 150f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 150f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1825,58 +1824,58 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 155f\n"
"150:" // Height 8: no bias
"tbz %x[flags], #0, 154f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x8, LSL #2\n"
- "add x26, x27, x8, LSL #2\n"
- "add x25, x26, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
- "add x22, x23, x8, LSL #2\n"
- "add x21, x22, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x27, x28, x26, LSL #2\n"
"bge 153f\n"
- "tbz x17, #1, 151f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x8, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x17, #0, 152f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
- "ld1 { v31.s }[2], [x21]\n"
+ "tbz x4, #1, 151f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d31, [x27], #0x8\n"
+ "tbz x4, #0, 152f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
+ "ld1 { v31.s }[2], [x27]\n"
"b 152f\n"
"151:" // Height 8: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x8, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
+ "ldr s31, [x27, #0x0]\n"
"152:" // Height 8: Partial accumulate: Done
- "sub x14, x14, x8\n"
+ "sub x6, x6, x26\n"
"b 155f\n"
"153:" // Height 8: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
- "ldr q30, [x22, #0x0]\n"
- "ldr q31, [x21, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
"b 155f\n"
"154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
@@ -1888,188 +1887,188 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"155:" // Height 8: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"156:" // Height 8: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x8, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "ldr x20, [x20, #0x38]\n"
- "cbnz x13, 158f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x8, LSL #2\n"
- "add x9, x9, x8, LSL #2\n"
- "add x27, x27, x8, LSL #2\n"
- "add x25, x25, x8, LSL #2\n"
- "add x24, x24, x8, LSL #2\n"
- "add x23, x23, x8, LSL #2\n"
- "add x22, x22, x8, LSL #2\n"
- "add x20, x20, x8, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
+ "ldr x27, [x26, #0x38]\n"
+ "cbnz x7, 158f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
+ "add x27, x27, x26, LSL #2\n"
"b 158f\n"
"157:" // Height 8: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x8, LSL #2\n"
- "add x27, x9, x8, LSL #2\n"
- "add x25, x27, x8, LSL #2\n"
- "add x24, x25, x8, LSL #2\n"
- "add x23, x24, x8, LSL #2\n"
- "add x22, x23, x8, LSL #2\n"
- "add x20, x22, x8, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
+ "add x27, x11, x27, LSL #2\n"
"158:" // Height 8: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 161f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x27, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 160f\n"
"159:" // Height 8: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x8, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x8\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "mov v10.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "ldr x8, [x16, #0x38]\n"
+ "add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x11, x11, #0x10\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x8\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x9, x9, #0x10\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v8.d[1], x26\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
- "add x27, x27, #0x10\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "mov v9.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "ldr x8, [x25, #0x8]\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
- "ldr x21, [x24, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d6, [x11, #0x0]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
- "ldr d4, [x24, #0x0]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "mov v3.d[1], x8\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "mov v4.d[1], x21\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr d5, [x23, #0x0]\n"
- "add x20, x20, #0x10\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "ldr x8, [x23, #0x8]\n"
- "cmp x12, #0x8\n"
- "ldr d6, [x22, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "mov v5.d[1], x8\n"
+ "ldr d7, [x27, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
"ldr x26, [x16, #0x8]\n"
- "ldr x21, [x22, #0x8]\n"
- "ldr d7, [x20, #0x0]\n"
- "mov v8.d[1], x26\n"
- "ldr x8, [x20, #0x8]\n"
- "mov v6.d[1], x21\n"
- "mov v7.d[1], x8\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x15, #0x8]\n"
+ "mov v2.d[1], x26\n"
+ "ldr x26, [x14, #0x8]\n"
+ "mov v3.d[1], x26\n"
+ "ldr x26, [x13, #0x8]\n"
+ "mov v4.d[1], x26\n"
+ "ldr x26, [x12, #0x8]\n"
+ "mov v5.d[1], x26\n"
+ "ldr x26, [x11, #0x8]\n"
+ "mov v6.d[1], x26\n"
+ "ldr x26, [x27, #0x8]\n"
+ "mov v7.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 159b\n"
"160:" // Height 8: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x27, x27, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x20, x20, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
@@ -2082,54 +2081,52 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v11.4s, v6.s[3]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
"161:" // Height 8: Multiply loop: Main loop skip
- "cbz x12, 163f\n"
+ "cbz x8, 163f\n"
"162:" // Height 8: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr s7, [x20], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "fmla v31.4s, v12.4s, v7.s[0]\n"
- "cbnz x12, 162b\n"
+ "ldr s0, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s23, [x16], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s21, [x14], #0x4\n"
+ "ldr s20, [x13], #0x4\n"
+ "ldr s19, [x12], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "fmla v25.4s, v16.4s, v23.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v22.s[0]\n"
+ "fmla v27.4s, v16.4s, v21.s[0]\n"
+ "fmla v28.4s, v16.4s, v20.s[0]\n"
+ "fmla v29.4s, v16.4s, v19.s[0]\n"
+ "fmla v30.4s, v16.4s, v18.s[0]\n"
+ "fmla v31.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 162b\n"
"163:" // Height 8: Multiply loop: No odd multiplies
- "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x8\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 156b\n"
- "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x8, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "add x27, x28, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x8, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x8, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x8, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x8, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x8, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x8, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 164f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x8, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x8]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
@@ -2138,76 +2135,77 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmin v29.4s, v29.4s, v16.4s\n"
"fmin v30.4s, v30.4s, v16.4s\n"
"fmin v31.4s, v31.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "fmax v31.4s, v31.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v16.4s\n"
"164:" // Height 8: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 167f\n"
- "tbz x17, #1, 165f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x17, #0, 166f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
- "st1 { v31.s }[2], [x21]\n"
+ "tbz x4, #1, 165f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
+ "str d31, [x27], #0x8\n"
+ "tbz x4, #0, 166f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
+ "st1 { v31.s }[2], [x27]\n"
"b 166f\n"
"165:" // Height 8: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
- "str s31, [x21, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
+ "str s31, [x27, #0x0]\n"
"166:" // Height 8: Partial direct writeback: Done
"b 168f\n"
"167:" // Height 8: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
- "str q30, [x22, #0x0]\n"
- "str q31, [x21, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
+ "str q31, [x27, #0x0]\n"
"168:" // Height 8: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 149b\n"
"subs %x[M], %x[M], #0x8\n"
"beq 170f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 169f\n"
- "add x20, x20, #0x8\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x27, x27, #0x8\n"
+ "str x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"169:" // Update direct input
- "mov x8, #0x20\n"
- "madd %x[input_ptr], x8, x20, %x[input_ptr]\n"
+ "mov x26, #0x20\n"
+ "madd %x[input_ptr], x26, x27, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
index bd22336c8d..004e5d7f23 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x8\n"
"bge 148f\n"
@@ -140,11 +139,11 @@ void a64_hybrid_fp32_mla_8x4 (
"9:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
"cbnz x10, 11f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -189,10 +188,10 @@ void a64_hybrid_fp32_mla_8x4 (
"14:" // Height 1: Multiply loop: Main loop skip
"cbz x9, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
+ "ldr s17, [x28], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
"sub x9, x9, #0x1\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v24.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
"cbnz x9, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
@@ -271,12 +270,12 @@ void a64_hybrid_fp32_mla_8x4 (
"30:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
"cbnz x10, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -284,7 +283,7 @@ void a64_hybrid_fp32_mla_8x4 (
"b 32f\n"
"31:" // Height 2: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
"32:" // Height 2: input setup done
"cmp x9, #0x4\n"
"blt 35f\n"
@@ -337,12 +336,12 @@ void a64_hybrid_fp32_mla_8x4 (
"35:" // Height 2: Multiply loop: Main loop skip
"cbz x9, 37f\n"
"36:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v18.s[0]\n"
+ "fmla v25.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
"cbnz x9, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
@@ -437,13 +436,13 @@ void a64_hybrid_fp32_mla_8x4 (
"51:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
"cbnz x10, 53f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -452,8 +451,8 @@ void a64_hybrid_fp32_mla_8x4 (
"b 53f\n"
"52:" // Height 3: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
"53:" // Height 3: input setup done
"cmp x9, #0x4\n"
"blt 56f\n"
@@ -520,14 +519,14 @@ void a64_hybrid_fp32_mla_8x4 (
"56:" // Height 3: Multiply loop: Main loop skip
"cbz x9, 58f\n"
"57:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s19, [x28], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v19.s[0]\n"
+ "fmla v25.4s, v16.4s, v18.s[0]\n"
+ "fmla v26.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
"cbnz x9, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
@@ -637,14 +636,14 @@ void a64_hybrid_fp32_mla_8x4 (
"72:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
"cbnz x10, 74f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -654,9 +653,9 @@ void a64_hybrid_fp32_mla_8x4 (
"b 74f\n"
"73:" // Height 4: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"74:" // Height 4: input setup done
"cmp x9, #0x4\n"
"blt 77f\n"
@@ -737,17 +736,17 @@ void a64_hybrid_fp32_mla_8x4 (
"77:" // Height 4: Multiply loop: Main loop skip
"cbz x9, 79f\n"
"78:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s20, [x28], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v20.s[0]\n"
+ "fmla v25.4s, v16.4s, v19.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "fmla v26.4s, v16.4s, v18.s[0]\n"
+ "fmla v27.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 78b\n"
"79:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -871,15 +870,15 @@ void a64_hybrid_fp32_mla_8x4 (
"93:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
"cbnz x10, 95f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -890,10 +889,10 @@ void a64_hybrid_fp32_mla_8x4 (
"b 95f\n"
"94:" // Height 5: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"95:" // Height 5: input setup done
"cmp x9, #0x4\n"
"blt 98f\n"
@@ -988,19 +987,19 @@ void a64_hybrid_fp32_mla_8x4 (
"98:" // Height 5: Multiply loop: Main loop skip
"cbz x9, 100f\n"
"99:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s21, [x28], #0x4\n"
+ "ldr s20, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v21.s[0]\n"
+ "fmla v25.4s, v16.4s, v20.s[0]\n"
+ "fmla v26.4s, v16.4s, v19.s[0]\n"
+ "fmla v27.4s, v16.4s, v18.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "fmla v28.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1139,16 +1138,16 @@ void a64_hybrid_fp32_mla_8x4 (
"114:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
"cbnz x10, 116f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1160,11 +1159,11 @@ void a64_hybrid_fp32_mla_8x4 (
"b 116f\n"
"115:" // Height 6: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"116:" // Height 6: input setup done
"cmp x9, #0x4\n"
"blt 119f\n"
@@ -1273,21 +1272,21 @@ void a64_hybrid_fp32_mla_8x4 (
"119:" // Height 6: Multiply loop: Main loop skip
"cbz x9, 121f\n"
"120:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s22, [x28], #0x4\n"
+ "ldr s21, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v22.s[0]\n"
+ "fmla v25.4s, v16.4s, v21.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "fmla v26.4s, v16.4s, v20.s[0]\n"
+ "fmla v27.4s, v16.4s, v19.s[0]\n"
+ "fmla v28.4s, v16.4s, v18.s[0]\n"
+ "fmla v29.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 120b\n"
"121:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1441,17 +1440,17 @@ void a64_hybrid_fp32_mla_8x4 (
"135:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
"cbnz x10, 137f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1464,12 +1463,12 @@ void a64_hybrid_fp32_mla_8x4 (
"b 137f\n"
"136:" // Height 7: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"137:" // Height 7: input setup done
"cmp x9, #0x4\n"
"blt 140f\n"
@@ -1592,23 +1591,23 @@ void a64_hybrid_fp32_mla_8x4 (
"140:" // Height 7: Multiply loop: Main loop skip
"cbz x9, 142f\n"
"141:" // Height 7: Multiply loop: Odd block loop
- "ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s23, [x28], #0x4\n"
+ "ldr s22, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v23.s[0]\n"
+ "fmla v25.4s, v16.4s, v22.s[0]\n"
+ "fmla v26.4s, v16.4s, v21.s[0]\n"
+ "fmla v27.4s, v16.4s, v20.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "fmla v28.4s, v16.4s, v19.s[0]\n"
+ "fmla v29.4s, v16.4s, v18.s[0]\n"
+ "fmla v30.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 141b\n"
"142:" // Height 7: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1780,18 +1779,18 @@ void a64_hybrid_fp32_mla_8x4 (
"156:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
- "ldr x21, [x21, #0x38]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
"cbnz x10, 158f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1805,13 +1804,13 @@ void a64_hybrid_fp32_mla_8x4 (
"b 158f\n"
"157:" // Height 8: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"158:" // Height 8: input setup done
"cmp x9, #0x4\n"
"blt 161f\n"
@@ -1949,24 +1948,24 @@ void a64_hybrid_fp32_mla_8x4 (
"cbz x9, 163f\n"
"162:" // Height 8: Multiply loop: Odd block loop
"ldr s0, [x28], #0x4\n"
- "ldr s1, [x27], #0x4\n"
+ "ldr s23, [x27], #0x4\n"
"sub x9, x9, #0x1\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr s7, [x21], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr s22, [x26], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "fmla v25.4s, v16.4s, v23.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "fmla v31.4s, v12.4s, v7.s[0]\n"
+ "fmla v26.4s, v16.4s, v22.s[0]\n"
+ "fmla v27.4s, v16.4s, v21.s[0]\n"
+ "fmla v28.4s, v16.4s, v20.s[0]\n"
+ "fmla v29.4s, v16.4s, v19.s[0]\n"
+ "fmla v30.4s, v16.4s, v18.s[0]\n"
+ "fmla v31.4s, v16.4s, v17.s[0]\n"
"cbnz x9, 162b\n"
"163:" // Height 8: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2068,10 +2067,9 @@ void a64_hybrid_fp32_mla_8x4 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index e6e7950979..f31dd7afd0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index a0ea96822a..0e468b196a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -93,7 +93,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 130f\n"
@@ -255,11 +254,11 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"20:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 22f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -279,31 +278,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"23:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q23, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x8\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
"add x28, x28, #0xc0\n"
"ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x28, #0x20]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ldr q7, [x28, #0x30]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"ld1 { v0.4s }, [x24], #0x10\n"
@@ -311,28 +310,28 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"24:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q25, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q21, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e57ec0a // bfmmla v10.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x80]\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
+ "ldr q22, [x28, #0x90]\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xa0]\n"
+ ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
+ "ldr q5, [x28, #0xb0]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
+ ".inst 0x6e45ec13 // bfmmla v19.4s, v0.8h, v5.8h\n"
"25:" // Height 1: Multiply loop: Main loop skip
"cbz x25, 28f\n"
"cbz x25, 28f\n"
@@ -344,31 +343,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr s0, [x24, #0x0]\n"
"27:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q4, [x28, #0x40]\n"
- "ldr q5, [x28, #0x50]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q6, [x28, #0x60]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q4, [x28, #0x80]\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x70]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x80]\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ "ldr q21, [x28, #0xb0]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x28, x28, #0xc0\n"
"28:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -384,21 +383,21 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"uzp1 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 29f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
"29:" // Height 1: No activation
"cmp x9, #0x18\n"
"bge 42f\n"
@@ -678,12 +677,12 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"63:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 65f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -691,7 +690,7 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"b 65f\n"
"64:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"65:" // Height 2: input setup done
"cmp x25, #0x4\n"
"blt 68f\n"
@@ -707,31 +706,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q3, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q23, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "ldr q1, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x8\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e41ec0c // bfmmla v12.4s, v0.8h, v1.8h\n"
"ldr q4, [x28, #0x0]\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"ldr q5, [x28, #0x10]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
"ldr q6, [x28, #0x20]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"ldr q7, [x28, #0x30]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"ld1 { v0.4s }, [x24], #0x10\n"
@@ -742,28 +741,28 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q23, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x28, x28, #0xc0\n"
"68:" // Height 2: Multiply loop: Main loop skip
"cbz x25, 71f\n"
@@ -779,32 +778,32 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr s0, [x24, #0x0]\n"
"ldr s1, [x23, #0x0]\n"
"70:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- "ldr q4, [x28, #0x40]\n"
- "ldr q5, [x28, #0x50]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- "ldr q6, [x28, #0x60]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- "ldr q4, [x28, #0x80]\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ldr q22, [x28, #0x20]\n"
+ "ldr q21, [x28, #0x30]\n"
+ ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0x60]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ "ldr q21, [x28, #0xb0]\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
"add x28, x28, #0xc0\n"
"71:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -829,33 +828,33 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"uzp2 v13.2d, v13.2d, v19.2d\n"
"tbz %x[flags], #1, 72f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v22.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v4.4s, v4.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v22.4s\n"
+ "fmin v14.4s, v14.4s, v22.4s\n"
+ "fmin v15.4s, v15.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v22.4s\n"
+ "fmin v17.4s, v17.4s, v22.4s\n"
+ "fmin v18.4s, v18.4s, v22.4s\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v4.4s, v4.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
"72:" // Height 2: No activation
"cmp x9, #0x18\n"
"bge 85f\n"
@@ -1238,13 +1237,13 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"106:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 107f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 108f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1253,8 +1252,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"b 108f\n"
"107:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"108:" // Height 3: input setup done
"cmp x25, #0x4\n"
"blt 111f\n"
@@ -1285,7 +1284,7 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q3, [x28, #0x70]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
"ld1 { v1.4s }, [x23], #0x10\n"
@@ -1298,9 +1297,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xb0]\n"
"add x28, x28, #0xc0\n"
".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
@@ -1311,9 +1310,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x20]\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
"ld1 { v0.4s }, [x24], #0x10\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n"
"ld1 { v2.4s }, [x22], #0x10\n"
"ldr q7, [x28, #0x30]\n"
"bge 109b\n"
@@ -1324,10 +1323,10 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"sub x25, x25, #0x4\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q3, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
@@ -1335,29 +1334,29 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"111:" // Height 3: Multiply loop: Main loop skip
"cbz x25, 114f\n"
"cbz x25, 114f\n"
@@ -1375,46 +1374,46 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr s1, [x23, #0x0]\n"
"ldr s2, [x22, #0x0]\n"
"113:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q4, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x30]\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0x60]\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"114:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -1937,14 +1936,14 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"149:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 150f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 151f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1954,9 +1953,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"b 151f\n"
"150:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"151:" // Height 4: input setup done
"cmp x25, #0x4\n"
"blt 154f\n"
@@ -2033,39 +2032,39 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
+ "ldr q3, [x28, #0x40]\n"
".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x50]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
"ldr q6, [x28, #0x60]\n"
".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"154:" // Height 4: Multiply loop: Main loop skip
"cbz x25, 157f\n"
"cbz x25, 157f\n"
@@ -2086,47 +2085,47 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"ldr s2, [x22, #0x0]\n"
"ldr s3, [x21, #0x0]\n"
"156:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q4, [x28, #0x0]\n"
- "ldr q5, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q4, [x28, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x40]\n"
- ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x50]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- "ldr q4, [x28, #0x80]\n"
- ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
- "ldr q5, [x28, #0x90]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
- "ldr q7, [x28, #0xb0]\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x28, #0x60]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
"add x28, x28, #0xc0\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
- ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
"157:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -2415,7 +2414,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"174:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index 39ffcbef12..71e16d68b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
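// On the ".inst" idiom used throughout the generic.cpp kernels below: each
// bf16 instruction is emitted as a raw encoding, with the decoded mnemonic in
// a trailing comment, for example
//
//   ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
//
// presumably so the file assembles even with toolchains whose assemblers
// predate the BF16 extension; writing the mnemonic directly would need
// something like ".arch armv8.6-a+bf16" support. The encodings and their
// comments move in lockstep in the hunks that follow, so each rename can be
// checked against the hex (e.g. 0x6e46.. becomes 0x6e52.. where v6 becomes
// v18, the Rm field in bits 16-20 going from 6 to 18).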
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
index 4993777d62..5693c3f397 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -93,7 +93,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 176f\n"
@@ -211,11 +210,11 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"16:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -233,23 +232,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"19:" // Height 1: Multiply loop: Main loop head
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x8\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
"add x10, x10, #0x80\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"ldr q7, [x10, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"ld1 { v0.4s }, [x26], #0x10\n"
@@ -257,20 +256,20 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"20:" // Height 1: Multiply loop: Single iteration only
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x80\n"
"21:" // Height 1: Multiply loop: Main loop skip
@@ -284,23 +283,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr s0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -314,17 +313,17 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"25:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 34f\n"
@@ -515,12 +514,12 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"51:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 53f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -528,7 +527,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"b 53f\n"
"52:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"53:" // Height 2: input setup done
"cmp x27, #0x4\n"
"blt 56f\n"
@@ -542,23 +541,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x8\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"ldr q7, [x10, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"ld1 { v0.4s }, [x26], #0x10\n"
@@ -569,20 +568,20 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q18, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x4\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"add x10, x10, #0x80\n"
@@ -600,24 +599,24 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr s0, [x26, #0x0]\n"
"ldr s1, [x25, #0x0]\n"
"58:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
"add x10, x10, #0x80\n"
"59:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -638,25 +637,25 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp2 v11.2d, v11.2d, v15.2d\n"
"tbz %x[flags], #1, 60f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v6.4s, v6.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmax v6.4s, v6.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
"60:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 69f\n"
@@ -912,13 +911,13 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"86:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 87f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 88f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -927,8 +926,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"b 88f\n"
"87:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"88:" // Height 3: input setup done
"cmp x27, #0x4\n"
"blt 91f\n"
@@ -946,34 +945,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"sub x27, x27, #0x4\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
"cmp x27, #0x8\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
"ld1 { v1.4s }, [x25], #0x10\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
"ld1 { v0.4s }, [x26], #0x10\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"ld1 { v2.4s }, [x24], #0x10\n"
"ldr q7, [x10, #0x10]\n"
"bge 89b\n"
@@ -984,30 +983,30 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"sub x27, x27, #0x4\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"91:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 94f\n"
"cbz x27, 94f\n"
@@ -1025,34 +1024,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr s1, [x25, #0x0]\n"
"ldr s2, [x24, #0x0]\n"
"93:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"94:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1078,33 +1077,33 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp1 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 95f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v6.4s, v6.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v6.4s, v6.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"95:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 104f\n"
@@ -1401,14 +1400,14 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"121:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 123f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1418,9 +1417,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"b 123f\n"
"122:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"123:" // Height 4: input setup done
"cmp x27, #0x4\n"
"blt 126f\n"
@@ -1442,34 +1441,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
"ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
"ld1 { v3.4s }, [x23], #0x10\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
"ld1 { v0.4s }, [x26], #0x10\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"ld1 { v2.4s }, [x24], #0x10\n"
"ldr q7, [x10, #0x10]\n"
"bge 124b\n"
@@ -1483,29 +1482,29 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q26, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"126:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 129f\n"
"cbz x27, 129f\n"
@@ -1526,35 +1525,35 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr s2, [x24, #0x0]\n"
"ldr s3, [x23, #0x0]\n"
"128:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
"129:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1586,41 +1585,41 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"uzp2 v19.2d, v19.2d, v23.2d\n"
"tbz %x[flags], #1, 130f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v26.4s }, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1r { v0.4s }, [x20]\n"
- "fmin v6.4s, v6.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v1.4s\n"
- "fmin v13.4s, v13.4s, v1.4s\n"
- "fmin v14.4s, v14.4s, v1.4s\n"
- "fmin v8.4s, v8.4s, v1.4s\n"
- "fmin v9.4s, v9.4s, v1.4s\n"
- "fmin v10.4s, v10.4s, v1.4s\n"
- "fmin v11.4s, v11.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v1.4s\n"
- "fmin v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v1.4s\n"
- "fmin v22.4s, v22.4s, v1.4s\n"
- "fmin v16.4s, v16.4s, v1.4s\n"
- "fmin v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v1.4s\n"
- "fmin v19.4s, v19.4s, v1.4s\n"
- "fmax v6.4s, v6.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v0.4s\n"
- "fmax v13.4s, v13.4s, v0.4s\n"
- "fmax v14.4s, v14.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v0.4s\n"
- "fmax v9.4s, v9.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v0.4s\n"
- "fmax v20.4s, v20.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v0.4s\n"
- "fmax v22.4s, v22.4s, v0.4s\n"
- "fmax v16.4s, v16.4s, v0.4s\n"
- "fmax v17.4s, v17.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v0.4s\n"
- "fmax v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
"130:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 139f\n"
@@ -1982,15 +1981,15 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"156:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 158f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2001,10 +2000,10 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"b 158f\n"
"157:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"158:" // Height 5: input setup done
"cmp x27, #0x4\n"
"blt 161f\n"
@@ -2029,43 +2028,43 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"prfm pldl1keep, [x25, #0x80]\n"
"ld1 { v1.4s }, [x25], #0x10\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q5, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
"ldr q6, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"ld1 { v3.4s }, [x23], #0x10\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x10, #0x50]\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9e // bfmmla v30.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec0f // bfmmla v15.4s, v0.8h, v5.8h\n"
"ld1 { v0.4s }, [x26], #0x10\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
"ld1 { v2.4s }, [x24], #0x10\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e45ec9f // bfmmla v31.4s, v4.8h, v5.8h\n"
"ld1 { v4.4s }, [x22], #0x10\n"
"ldr q7, [x10, #0x10]\n"
"bge 159b\n"
@@ -2081,37 +2080,37 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
"161:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 164f\n"
"cbz x27, 164f\n"
@@ -2136,7 +2135,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr s4, [x22, #0x0]\n"
"163:" // Height 5: Multiply loop: Ragged operand read: Done
"ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
@@ -2145,34 +2144,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
"164:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2658,16 +2657,16 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"191:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 192f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 193f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -2679,11 +2678,11 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"b 193f\n"
"192:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"193:" // Height 6: input setup done
"cmp x27, #0x4\n"
"blt 196f\n"
@@ -2716,7 +2715,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q5, [x10, #0x30]\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
@@ -2724,10 +2723,10 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ld1 { v3.4s }, [x23], #0x10\n"
".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
"ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
"ldr q7, [x10, #0x50]\n"
"prfm pldl1keep, [x21, #0x80]\n"
"ld1 { v5.4s }, [x21], #0x10\n"
@@ -2766,37 +2765,37 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
"prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x20]\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q1, [x10, #0x30]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
"prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
"196:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 199f\n"
"cbz x27, 199f\n"
@@ -2823,45 +2822,45 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"ldr s4, [x22, #0x0]\n"
"ldr s5, [x21, #0x0]\n"
"198:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
- ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
- ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
"199:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3126,7 +3125,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"212:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 905a60265c..bfc9c7e8f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -108,5 +108,4 @@ public:
} // namespace arm_gemm

#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index b31b80586c..eac0e7167e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -78,329 +78,328 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[output_ptr]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "cbnz x12, 6f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
- "mov v4.d[1], x9\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr d25, [x12, #0xa0]\n"
+ "mov v21.d[1], x20\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- "mov v5.d[1], x28\n"
- "ldr x27, [x13, #0x98]\n"
- "mov v6.d[1], x27\n"
- "ldr x26, [x13, #0xa8]\n"
- "mov v7.d[1], x26\n"
- "ldr x25, [x13, #0xb8]\n"
- "mov v8.d[1], x25\n"
- "ldr x24, [x13, #0xc8]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr x20, [x13, #0xd8]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr x9, [x13, #0xe8]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr x28, [x13, #0xf8]\n"
- "mov v9.d[1], x24\n"
- "mov v10.d[1], x20\n"
- "add x10, x10, #0x10\n"
- "mov v4.d[1], x9\n"
- "add x13, x13, #0x100\n"
- "mov v5.d[1], x28\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr d22, [x12, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr d21, [x12, #0xe0]\n"
+ "mov v20.d[1], x20\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr d20, [x12, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ "mov v23.d[1], x23\n"
+ "mov v22.d[1], x22\n"
+ "add x9, x9, #0x10\n"
+ "mov v21.d[1], x21\n"
+ "add x12, x12, #0x100\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q4, [x13, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "sub x11, x11, #0x10\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr q22, [x12, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x12, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x12, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 18f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x11, 18f\n"
- "tbz x11, #1, 15f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 16f\n"
- "ld1 { v0.b }[2], [x10]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x4f80e292 // sdot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
- "neg v1.4s, v1.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "cmp x15, #0x10\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x15, #3, 24f\n"
- "str d16, [x14], #0x8\n"
- "tbz x15, #2, 22f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "tbz x15, #1, 21f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[14], [x14]\n"
+ "tbz x14, #3, 24f\n"
+ "str d16, [x13], #0x8\n"
+ "tbz x14, #2, 22f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "tbz x14, #1, 21f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[12], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 23f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[10], [x14]\n"
+ "tbz x14, #1, 23f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[8], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 26f\n"
- "str s16, [x14], #0x4\n"
- "tbz x15, #1, 25f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[6], [x14]\n"
+ "tbz x14, #2, 26f\n"
+ "str s16, [x13], #0x4\n"
+ "tbz x14, #1, 25f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[4], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 27f\n"
- "str h16, [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[2], [x14]\n"
+ "tbz x14, #1, 27f\n"
+ "str h16, [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -411,307 +410,307 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "cbnz x12, 36f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v25.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v5.d[1], x28\n"
+ "mov v24.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v6.d[1], x27\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- "mov v7.d[1], x26\n"
- "ldr x24, [x13, #0xc8]\n"
- "mov v8.d[1], x25\n"
- "ldr x20, [x13, #0xd8]\n"
- "ldr x9, [x13, #0xe8]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- "ldr x28, [x13, #0xf8]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "mov v9.d[1], x24\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v10.d[1], x20\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- "mov v4.d[1], x9\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x13, x13, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr d26, [x12, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr d25, [x12, #0xe0]\n"
+ "mov v29.d[1], x21\n"
+ "ldr x23, [x12, #0xc8]\n"
+ "mov v28.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "mov v27.d[1], x23\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ "mov v26.d[1], x22\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr q26, [x12, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x12, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x12, #0xf0]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 48f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x11, 48f\n"
- "tbz x11, #1, 45f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "tbz x11, #0, 46f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "add x23, x13, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
- "neg v2.4s, v2.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -721,122 +720,122 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x15, #3, 54f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "tbz x15, #2, 52f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "tbz x15, #1, 51f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "tbz x14, #3, 54f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 53f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "tbz x14, #1, 53f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 56f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "tbz x15, #1, 55f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "tbz x14, #2, 56f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 57f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "tbz x14, #1, 57f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -851,317 +850,317 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "cbnz x12, 66f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
- "add x22, x23, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v29.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v5.d[1], x28\n"
+ "mov v28.d[1], x23\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "mov v6.d[1], x27\n"
+ "mov v5.d[1], x22\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v7.d[1], x26\n"
+ "mov v4.d[1], x21\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr x24, [x13, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v8.d[1], x25\n"
+ "mov v3.d[1], x20\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x20, [x13, #0xd8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr x9, [x13, #0xe8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x28, [x13, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v9.d[1], x24\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "mov v10.d[1], x20\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x9\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr d30, [x12, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ "mov v31.d[1], x23\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr d29, [x12, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr d28, [x12, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ "add x27, x27, #0x10\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr q30, [x12, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x12, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x12, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 78f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x11, 78f\n"
- "tbz x11, #1, 75f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "tbz x11, #0, 76f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x4f81e394 // sdot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x4f82e398 // sdot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
- "add x21, x22, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
- "neg v3.4s, v3.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1171,73 +1170,73 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1251,156 +1250,156 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x15, #3, 84f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "tbz x15, #2, 82f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "tbz x15, #1, 81f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "tbz x14, #3, 84f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 83f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "tbz x14, #1, 83f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 86f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "tbz x15, #1, 85f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "tbz x14, #2, 86f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 87f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "tbz x14, #1, 87f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x4\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x13, %x[output_ptr]\n"
"madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
@@ -1420,117 +1419,117 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
- "cbnz x12, 96f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v4.d[1], x22\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v5.d[1], x28\n"
+ "mov v5.d[1], x21\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x24, [x13, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr x20, [x13, #0xd8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x27\n"
+ "mov v6.d[1], x20\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x9, [x13, #0xe8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr x28, [x13, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v7.d[1], x26\n"
+ "mov v7.d[1], x25\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v8.d[1], x25\n"
+ "mov v8.d[1], x24\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "add x21, x21, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v9.d[1], x24\n"
+ "mov v9.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v10.d[1], x20\n"
+ "mov v10.d[1], x22\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x9\n"
+ "mov v4.d[1], x21\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
+ "mov v5.d[1], x20\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "add x13, x13, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
@@ -1563,77 +1562,77 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,67 +1666,67 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 108f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x21], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x11, 108f\n"
- "tbz x11, #1, 105f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h3, [x21], #0x2\n"
- "tbz x11, #0, 106f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "ld1 { v3.b }[2], [x21]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "ldr b3, [x21, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
@@ -1735,64 +1734,64 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
"add x21, x22, x20\n"
- "add x20, x21, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "neg v4.4s, v4.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1806,93 +1805,93 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v29.4s, v29.4s, v14.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1910,172 +1909,172 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x15, #3, 114f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
- "tbz x15, #2, 112f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
- "tbz x15, #1, 111f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "tbz x14, #3, 114f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 113f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "tbz x14, #1, 113f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 116f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
- "tbz x15, #1, 115f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "tbz x14, #2, 116f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 117f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "tbz x14, #1, 117f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
@@ -2089,10 +2088,9 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
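(The register renumbering in the hunks above leaves the requantize arithmetic itself unchanged. As a reference for what that epilogue computes, here is a scalar C++ sketch of the sequence visible in the assembly: the row-sum fixup — v11..v14 hold the per-row sums multiplied by the negated b_offset — then the column bias, sqrdmulh by per_layer_mul, the optional sign correction guarded by flags bit 5, the rounding right shift, the c_offset add, the minval/maxval clamp, and the uzp1 narrowing. Function and parameter names below are illustrative assumptions, not identifiers from the library.)

#include <algorithm>
#include <cstdint>

// Scalar model of SQRDMULH: saturating rounding doubling multiply, high half.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;     // only saturating case
    int64_t p = 2 * (static_cast<int64_t>(a) * b) + (1LL << 31);
    return static_cast<int32_t>(p >> 32);
}

// Rounding right shift by n, with the and/sshr/sqadd fixup applied first:
// subtracting 1 from negative inputs turns SRSHL's round-half-up into
// round-half-away-from-zero. The "no shift correction" label in the assembly
// is the path where this fixup is skipped.
static int32_t rounding_rshift(int32_t x, int n)
{
    if (n == 0) return x;
    int64_t v = x;
    if (v < 0) v -= 1;                          // shift correction (flags bit 5 set)
    return static_cast<int32_t>((v + (1LL << (n - 1))) >> n);
}

// One output lane of the epilogue; row_sum_fixup is row_sum * (-b_offset),
// matching the neg/mul pair in the "skip row sum fixup" blocks above.
static int8_t requantize_lane(int32_t acc, int32_t row_sum_fixup, int32_t col_bias,
                              int32_t per_layer_mul, int n_right_shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = acc + row_sum_fixup + col_bias;     // add v11.. / add column bias
    v = sqrdmulh(v, per_layer_mul);                 // sqrdmulh by per_layer_mul
    v = rounding_rshift(v, n_right_shift);          // srshl by the (negative) shift
    int64_t w = static_cast<int64_t>(v) + c_offset; // add c_offset
    w = std::clamp(w, static_cast<int64_t>(minval), // smax with minval
                   static_cast<int64_t>(maxval));   // smin with maxval
    return static_cast<int8_t>(w);                  // uzp1 16b/8b truncating narrow
}

(The final uzp1 pair simply truncates 32-bit lanes to 16 and then 8 bits, which is safe because the clamp has already confined every value to the int8 output range.)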
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 485a47dc67..3b773a6827 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -78,7 +78,6 @@ void a64_hybrid_s8qa_dot_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
@@ -102,11 +101,11 @@ void a64_hybrid_s8qa_dot_4x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -128,32 +127,32 @@ void a64_hybrid_s8qa_dot_4x16 (
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
@@ -171,33 +170,33 @@ void a64_hybrid_s8qa_dot_4x16 (
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
"add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
@@ -211,16 +210,16 @@ void a64_hybrid_s8qa_dot_4x16 (
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q23, [x28, #0x0]\n"
+ "ldr q22, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
@@ -236,14 +235,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x4f80e2b0 // sdot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -252,72 +251,72 @@ void a64_hybrid_s8qa_dot_4x16 (
"bne 4b\n"
"prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v1.4s, v1.4s\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q22, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
"add x10, x10, #0x40\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
"cmp x9, #0x10\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
@@ -397,12 +396,12 @@ void a64_hybrid_s8qa_dot_4x16 (
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -410,7 +409,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"b 36f\n"
"35:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"36:" // Height 2: input setup done
"cmp x25, #0x10\n"
"blt 41f\n"
@@ -428,48 +427,48 @@ void a64_hybrid_s8qa_dot_4x16 (
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
"add x24, x24, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"add x23, x23, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
@@ -491,49 +490,49 @@ void a64_hybrid_s8qa_dot_4x16 (
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
"sub x25, x25, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"add x24, x24, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
"add x23, x23, #0x10\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
@@ -551,21 +550,21 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q27, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
+ ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
"cbz x25, 48f\n"
@@ -584,209 +583,209 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
+ "add x23, x27, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v2.4s, v2.4s\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
"add x10, x10, #0x40\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v27.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v25.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
"tbz x9, #3, 54f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
@@ -819,13 +818,13 @@ void a64_hybrid_s8qa_dot_4x16 (
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -834,8 +833,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"b 66f\n"
"65:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"66:" // Height 3: input setup done
"cmp x25, #0x10\n"
"blt 71f\n"
@@ -857,62 +856,62 @@ void a64_hybrid_s8qa_dot_4x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q28, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
@@ -940,63 +939,63 @@ void a64_hybrid_s8qa_dot_4x16 (
"sub x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q28, [x28, #0x80]\n"
"add x22, x22, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
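
(A reading aid between hunks, not part of the patch: the blocks above and below revolve around two flavours of the SDOT instruction — lane-indexed SDOT against the weight registers for the GEMM proper, and plain SDOT against v15 for the per-row sums that later feed the b_offset correction. A minimal scalar sketch of the lane-indexed form follows; the 2D layout of `w` is an illustrative assumption.)

#include <cstdint>
// Scalar model of "sdot vD.4s, vN.16b, vM.4b[idx]": every int32 lane of
// the accumulator gains a 4-way int8 dot product against the same
// broadcast group of four activation bytes.
void sdot_lane_model(int32_t acc[4], const int8_t w[4][4], const int8_t act[4])
{
    for (int lane = 0; lane < 4; ++lane)
        for (int k = 0; k < 4; ++k)
            acc[lane] += int32_t(w[lane][k]) * int32_t(act[k]);
}
// The row-sum form "sdot v11.4s, v0.16b, v15.16b" is the same arithmetic
// with v15 presumably holding all-ones bytes, so v11..v14 accumulate the
// sums of each row's activation bytes.
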
@@ -1018,25 +1017,25 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
"cbz x25, 78f\n"
@@ -1059,144 +1058,144 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q31, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q30, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v31.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v31.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1204,132 +1203,132 @@ void a64_hybrid_s8qa_dot_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
"tbz x9, #3, 84f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
@@ -1370,14 +1369,14 @@ void a64_hybrid_s8qa_dot_4x16 (
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1387,9 +1386,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"b 96f\n"
"95:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"96:" // Height 4: input setup done
"cmp x25, #0x10\n"
"blt 101f\n"
@@ -1614,29 +1613,29 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
"cbz x25, 108f\n"
@@ -1663,73 +1662,73 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
"ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x20, x21, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
@@ -1740,100 +1739,100 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1845,163 +1844,163 @@ void a64_hybrid_s8qa_dot_4x16 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
"tbz x9, #3, 114f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
@@ -2017,7 +2016,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index 69ea87bc9e..55ea68d1b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -98,5 +98,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
index 69d01a265e..883bd5afdd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
@@ -78,7 +78,6 @@ void a64_hybrid_s8qa_mmla_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 97f\n"
@@ -106,11 +105,11 @@ void a64_hybrid_s8qa_mmla_4x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -131,35 +130,35 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr q4, [x28, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v27.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v27.2d\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
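
(Another reading aid, not part of the patch: the mmla kernels in this file differ from the dot kernels above in how rows are fed in — trn1/trn2 on the .2d lanes zip two input rows into one register so that each SMMLA produces a 2x2 block of int32 accumulators from a pair of 2x8 int8 tiles. A scalar model of one SMMLA follows; the array layout is an illustrative assumption.)

#include <cstdint>
// "smmla vD.4s, vN.16b, vM.16b": vD viewed as a 2x2 int32 matrix,
// vN and vM as row-major 2x8 int8 matrices; computes D += A * B^T.
void smmla_model(int32_t acc[2][2], const int8_t a[2][8], const int8_t b[2][8])
{
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                acc[i][j] += int32_t(a[i][k]) * int32_t(b[j][k]);
}
// This 2x2 output layout is why the later "uzp1 vX.2d, vX.2d, vY.2d"
// steps are needed: they de-interleave the paired rows back into
// per-row vectors before the requantize sequence runs.
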
@@ -177,36 +176,36 @@ void a64_hybrid_s8qa_mmla_4x16 (
"prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v24.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
"add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
@@ -217,29 +216,29 @@ void a64_hybrid_s8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x24], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
- ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
@@ -264,26 +263,26 @@ void a64_hybrid_s8qa_mmla_4x16 (
"17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x24, #0x0]\n"
"18:" // Height 1: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v24.2d\n"
"tbnz %x[flags], #31, 19f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"19:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
- ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"20:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -297,75 +296,75 @@ void a64_hybrid_s8qa_mmla_4x16 (
"uzp1 v19.2d, v19.2d, v23.2d\n"
"mov v23.16b, v16.16b\n"
"tbnz %x[flags], #31, 21f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v1.4s, v1.4s\n"
+ "neg v16.4s, v16.4s\n"
"dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v16.4s\n"
"21:" // Height 1: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
"add v23.4s, v23.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "ldr q20, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v16.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v16.4s\n"
"add x10, x10, #0x40\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
"tbz %x[flags], #5, 22f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v22.16b, v23.16b, v0.16b\n"
+ "and v21.16b, v17.16b, v0.16b\n"
+ "and v20.16b, v18.16b, v0.16b\n"
+ "and v16.16b, v19.16b, v0.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v22.4s\n"
+ "sqadd v17.4s, v17.4s, v21.4s\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
"22:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
"cmp x9, #0x10\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"uzp1 v23.8h, v23.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v17.16b\n"
+ "uzp1 v16.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v16.16b\n"
"bge 31f\n"
"tbz x9, #3, 26f\n"
"str d23, [x27], #0x8\n"
@@ -442,12 +441,12 @@ void a64_hybrid_s8qa_mmla_4x16 (
"36:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 37f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 38f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -455,7 +454,7 @@ void a64_hybrid_s8qa_mmla_4x16 (
"b 38f\n"
"37:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"38:" // Height 2: input setup done
"cmp x25, #0x10\n"
"blt 43f\n"
@@ -473,34 +472,34 @@ void a64_hybrid_s8qa_mmla_4x16 (
"39:" // Height 2: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
"add x23, x23, #0x10\n"
- ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
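Each trn1/trn2 pair in the loop above interleaves 8-byte halves of two input rows into the 2x8 operand layout SMMLA expects, so one instruction accumulates a 2x2 int32 block covering two output rows at once. A scalar reference for a single SMMLA, with the second operand held as rows of B-transposed as the weight packing provides (names here are illustrative):

    #include <cstdint>

    // acc is a 2x2 int32 tile (one 128-bit accumulator register); a holds
    // two rows of 8 int8 values, b holds two columns of B stored as rows,
    // i.e. acc[i][j] += dot(a[i], b[j]).
    void smmla_ref(int32_t acc[2][2], const int8_t a[2][8], const int8_t b[2][8])
    {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 8; ++k)
                    acc[i][j] += (int32_t)a[i][k] * (int32_t)b[j][k];
    }

This interleaving is why the epilogues below pair uzp1/uzp2 on the accumulators: they separate the two interleaved output rows again before requantization and writeback.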
@@ -522,35 +521,35 @@ void a64_hybrid_s8qa_mmla_4x16 (
"41:" // Height 2: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 42f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
@@ -562,30 +561,30 @@ void a64_hybrid_s8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 46f\n"
"44:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
"tbnz %x[flags], #31, 45f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"45:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
- ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"bge 44b\n"
"46:" // Height 2: Multiply loop: Skip odd blocks
@@ -621,22 +620,22 @@ void a64_hybrid_s8qa_mmla_4x16 (
"tbnz %x[flags], #31, 51f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"51:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
- ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"52:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -644,127 +643,127 @@ void a64_hybrid_s8qa_mmla_4x16 (
"cmp x26, x20\n"
"bne 36b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
+ "uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "mov v23.16b, v4.16b\n"
+ "mov v23.16b, v24.16b\n"
"tbnz %x[flags], #31, 53f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v2.4s, v2.4s\n"
+ "neg v24.4s, v24.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"53:" // Height 2: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v23.4s, v23.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
"add x10, x10, #0x40\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #5, 54f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v0.16b\n"
+ "and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
"54:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
"uzp1 v23.8h, v23.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
@@ -774,68 +773,68 @@ void a64_hybrid_s8qa_mmla_4x16 (
"bge 63f\n"
"tbz x9, #3, 58f\n"
"str d23, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x9, #2, 56f\n"
"st1 { v23.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v23.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 62f\n"
"55:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 62f\n"
"st1 { v23.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 62f\n"
"56:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 57f\n"
"st1 { v23.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 62f\n"
"57:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 62f\n"
"st1 { v23.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 62f\n"
"58:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 60f\n"
"str s23, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
+ "str s16, [x23], #0x4\n"
"tbz x9, #1, 59f\n"
"st1 { v23.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 62f\n"
"59:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 62f\n"
"st1 { v23.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 62f\n"
"60:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 61f\n"
"str h23, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
+ "str h16, [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 62f\n"
"61:" // Height 2: Partial direct writeback: partial_1_0
"str b23, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"62:" // Height 2: Partial direct writeback: Done
"b 64f\n"
"63:" // Height 2: Full writeback
"str q23, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"64:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 34b\n"
@@ -872,13 +871,13 @@ void a64_hybrid_s8qa_mmla_4x16 (
"68:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 69f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -887,8 +886,8 @@ void a64_hybrid_s8qa_mmla_4x16 (
"b 70f\n"
"69:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"70:" // Height 3: input setup done
"cmp x25, #0x10\n"
"blt 75f\n"
@@ -909,12 +908,12 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q4, [x28, #0x80]\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
@@ -930,15 +929,15 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
@@ -948,12 +947,12 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
"tbnz %x[flags], #31, 72f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
@@ -981,12 +980,12 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q4, [x28, #0x80]\n"
".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
@@ -1003,15 +1002,15 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
"add x22, x22, #0x10\n"
- ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
@@ -1021,12 +1020,12 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
"tbnz %x[flags], #31, 74f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
@@ -1042,41 +1041,41 @@ void a64_hybrid_s8qa_mmla_4x16 (
"blt 78f\n"
"76:" // Height 3: Multiply loop: Odd block loop
"ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x22], #0x8\n"
- "trn1 v2.2d, v3.2d, v7.2d\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x22], #0x8\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
"sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n"
- ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
"add x28, x28, #0x80\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n"
- ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
"bge 76b\n"
"78:" // Height 3: Multiply loop: Skip odd blocks
"cbz x25, 84f\n"
@@ -1115,52 +1114,52 @@ void a64_hybrid_s8qa_mmla_4x16 (
"ldr b3, [x22, #0x0]\n"
"82:" // Height 3: Multiply loop: Ragged operand read: Done
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v9.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
"tbnz %x[flags], #31, 83f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"83:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n"
- "ldr q5, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
- ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
"add x28, x28, #0x80\n"
- ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n"
- ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n"
- ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
"84:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 68b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
@@ -1168,116 +1167,116 @@ void a64_hybrid_s8qa_mmla_4x16 (
"uzp1 v25.2d, v25.2d, v29.2d\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v4.16b\n"
+ "mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 85f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
+ "neg v23.4s, v23.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
"dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v23.4s\n"
+ "mul v12.4s, v12.4s, v23.4s\n"
+ "mul v13.4s, v13.4s, v23.4s\n"
"85:" // Height 3: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q30, [x10, #0x10]\n"
"add v31.4s, v31.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "ldr q28, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v23.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v23.4s\n"
"tbz %x[flags], #5, 86f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
+ "and v1.16b, v19.16b, v0.16b\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v0.16b\n"
+ "and v28.16b, v26.16b, v0.16b\n"
+ "and v23.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
"86:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1285,132 +1284,132 @@ void a64_hybrid_s8qa_mmla_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
"uzp1 v31.8h, v31.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
"uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 95f\n"
"tbz x9, #3, 90f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 88f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 87f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 94f\n"
"87:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 94f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 94f\n"
"88:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 89f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 94f\n"
"89:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 94f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 94f\n"
"90:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 92f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 91f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 94f\n"
"91:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 94f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 94f\n"
"92:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 93f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 94f\n"
"93:" // Height 3: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"94:" // Height 3: Partial direct writeback: Done
"b 96f\n"
"95:" // Height 3: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"96:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 66b\n"
@@ -1451,14 +1450,14 @@ void a64_hybrid_s8qa_mmla_4x16 (
"100:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 101f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 102f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1468,9 +1467,9 @@ void a64_hybrid_s8qa_mmla_4x16 (
"b 102f\n"
"101:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"102:" // Height 4: input setup done
"cmp x25, #0x10\n"
"blt 107f\n"
@@ -1630,42 +1629,42 @@ void a64_hybrid_s8qa_mmla_4x16 (
"blt 110f\n"
"108:" // Height 4: Multiply loop: Odd block loop
"ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v2.2d, v3.2d, v7.2d\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "trn1 v2.2d, v2.2d, v1.2d\n"
"tbnz %x[flags], #31, 109f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"109:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
"sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n"
- ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
"add x28, x28, #0x80\n"
".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n"
- ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
"bge 108b\n"
"110:" // Height 4: Multiply loop: Skip odd blocks
"cbz x25, 116f\n"
@@ -1716,51 +1715,51 @@ void a64_hybrid_s8qa_mmla_4x16 (
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"115:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n"
- "ldr q5, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
- ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
"add x28, x28, #0x80\n"
- ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
- ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n"
- ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n"
- ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
- ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
"116:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 100b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"add x21, x22, x20\n"
- "add x20, x21, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
"uzp2 v24.2d, v24.2d, v28.2d\n"
@@ -1770,38 +1769,38 @@ void a64_hybrid_s8qa_mmla_4x16 (
"uzp2 v26.2d, v26.2d, v30.2d\n"
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v4.16b\n"
+ "mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 117f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v4.4s, v4.4s\n"
+ "neg v0.4s, v0.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
"dup v14.4s, v13.s[3]\n"
"dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"117:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v31.4s, v31.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v13.4s\n"
"add v28.4s, v28.4s, v13.4s\n"
"add x10, x10, #0x40\n"
@@ -1812,100 +1811,100 @@ void a64_hybrid_s8qa_mmla_4x16 (
"add v26.4s, v26.4s, v14.4s\n"
"add v27.4s, v27.4s, v14.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v23.4s, v23.4s, v0.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
"tbz %x[flags], #5, 118f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "and v5.16b, v23.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v31.16b, v0.16b\n"
+ "and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
+ "and v7.16b, v21.16b, v0.16b\n"
+ "and v6.16b, v22.16b, v0.16b\n"
+ "and v5.16b, v16.16b, v0.16b\n"
+ "and v4.16b, v17.16b, v0.16b\n"
+ "and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sqadd v23.4s, v23.4s, v5.4s\n"
- "and v6.16b, v28.16b, v0.16b\n"
- "and v7.16b, v29.16b, v0.16b\n"
- "and v8.16b, v30.16b, v0.16b\n"
- "and v9.16b, v24.16b, v0.16b\n"
- "and v10.16b, v25.16b, v0.16b\n"
- "and v4.16b, v26.16b, v0.16b\n"
- "and v5.16b, v27.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v0.16b\n"
+ "and v5.16b, v30.16b, v0.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "and v3.16b, v25.16b, v0.16b\n"
+ "and v2.16b, v26.16b, v0.16b\n"
+ "and v1.16b, v27.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v6.4s\n"
- "sqadd v29.4s, v29.4s, v7.4s\n"
- "sqadd v30.4s, v30.4s, v8.4s\n"
- "sqadd v24.4s, v24.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v4.4s\n"
- "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v3.4s\n"
+ "sqadd v26.4s, v26.4s, v2.4s\n"
+ "sqadd v27.4s, v27.4s, v1.4s\n"
"118:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1917,163 +1916,163 @@ void a64_hybrid_s8qa_mmla_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
"uzp1 v31.8h, v31.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v23.8h, v23.8h, v28.8h\n"
- "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
"uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v23.16b, v23.16b, v28.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 127f\n"
"tbz x9, #3, 122f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x9, #2, 120f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v23.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
"tbz x9, #1, 119f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v23.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v23.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 126f\n"
"119:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 126f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v23.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 126f\n"
"120:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 121f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v23.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v23.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 126f\n"
"121:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 126f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v23.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 126f\n"
"122:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 124f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s23, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
"tbz x9, #1, 123f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v23.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v23.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 126f\n"
"123:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 126f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v23.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 126f\n"
"124:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 125f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h23, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v23.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 126f\n"
"125:" // Height 4: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b23, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"126:" // Height 4: Partial direct writeback: Done
"b 128f\n"
"127:" // Height 4: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
- "str q23, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"128:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 98b\n"
@@ -2089,7 +2088,6 @@ void a64_hybrid_s8qa_mmla_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"130:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index b028a8a9a3..2b7531d1e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -108,5 +108,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
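The a55.cpp variant that follows relies on a load-splitting idiom tuned for Cortex-A55, whose load pipe handles 64-bit transfers more cheaply than 128-bit vector loads: each ldr q of the weight matrix is replaced by an ldr d for the low half, an ldr x for the high half, and a mov v.d[1] to merge them, so the fetch overlaps the in-flight sdot instructions. A minimal sketch under standard GCC/Clang inline-assembly assumptions; the helper name load_q_split and the register choices are placeholders, not the generator's own:

#include <arm_neon.h>
#include <cstdint>

// Load one 128-bit vector as two 64-bit halves, as the A55 kernel does.
static inline int8x16_t load_q_split(const int8_t *ptr)
{
    int8x16_t result;
    __asm__ __volatile__(
        "ldr d0, [%[ptr]]\n"         // low 64 bits straight into the SIMD file
        "ldr x20, [%[ptr], #0x8]\n"  // high 64 bits via the integer pipe
        "mov v0.d[1], x20\n"         // merge into a single 128-bit register
        "str q0, [%[out]]\n"         // hand the merged vector back to C++
        :
        : [ptr] "r" (ptr), [out] "r" (&result)
        : "v0", "x20", "memory");
    return result;
}

In the generated loop below, pairs of temporaries (v16/v17 for data, x20/x21 for the high halves) ping-pong through this pattern so that one vector is being assembled while the previous one is being consumed by sdot.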
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index b97b63cdce..38a57b0741 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 136f\n"
@@ -111,11 +110,11 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
"cbnz x14, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
@@ -132,129 +131,129 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr d17, [x15, #0x20]\n"
"ldr x20, [x15, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x38]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x15, #0x40]\n"
"ldr x20, [x15, #0x48]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x15, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x15, #0x60]\n"
"ldr x20, [x15, #0x68]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x78]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x15, #0x80]\n"
"ldr x20, [x15, #0x88]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x15, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x15, #0xa0]\n"
"ldr x20, [x15, #0xa8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0xb8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x15, #0xc0]\n"
"ldr x20, [x15, #0xc8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x15, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x15, #0xe0]\n"
"ldr x20, [x15, #0xe8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0xf8]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "mov v16.d[1], x20\n"
"add x12, x12, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
"ldr x20, [x15, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
"sub x13, x13, #0x10\n"
"ldr d7, [x15, #0x10]\n"
"cmp x13, #0x20\n"
- "ldr x10, [x12, #0x8]\n"
+ "ldr x21, [x12, #0x8]\n"
"mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x18]\n"
- "mov v0.d[1], x10\n"
- "mov v7.d[1], x11\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x12, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x15, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x15, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x15, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x15, #0xf0]\n"
"add x12, x12, #0x10\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
"cbz x13, 14f\n"
"cmp x13, #0x4\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
"cmp x13, #0x4\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x15, x15, #0x40\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
@@ -267,28 +266,28 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
"14:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 4b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q16, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "ldr q16, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "ldr q16, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"prfm pstl1keep, [x17, #0x0]\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 15f\n"
@@ -304,10 +303,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -320,45 +319,45 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 17f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
"17:" // Height 1: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
"tbz x16, #3, 21f\n"
"str d8, [x17], #0x8\n"
@@ -433,247 +432,247 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x11, [x20, #0x8]\n"
"cbnz x14, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
- "add x9, x9, x20\n"
+ "add x11, x11, x20\n"
"b 33f\n"
"32:" // Height 2: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x9, x12, x20\n"
+ "add x11, x12, x21\n"
"33:" // Height 2: input setup done
"cmp x13, #0x10\n"
"blt 36f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
"ldr q7, [x15, #0x10]\n"
"blt 35f\n"
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr d17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"ldr x20, [x15, #0x48]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x20, [x15, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x15, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
"ldr x20, [x15, #0x88]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x20, [x15, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x15, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
"ldr x20, [x15, #0xc8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- "mov v6.d[1], x20\n"
- "ldr x11, [x15, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x20, [x15, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v6.d[1], x20\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x15, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x21\n"
"add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x11, x11, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- "ldr x20, [x15, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
"sub x13, x13, #0x10\n"
"ldr d7, [x15, #0x10]\n"
"cmp x13, #0x20\n"
- "ldr x10, [x12, #0x8]\n"
- "mov v6.d[1], x20\n"
- "ldr x28, [x9, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x15, #0x18]\n"
- "mov v1.d[1], x28\n"
+ "ldr x20, [x12, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v1.d[1], x21\n"
"prfm pldl1keep, [x12, #0x80]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x15, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x15, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x15, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x15, #0xf0]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"36:" // Height 2: Multiply loop: Main loop skip
"cbz x13, 41f\n"
"cmp x13, #0x4\n"
"blt 38f\n"
"37:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s19, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x0]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 37b\n"
"38:" // Height 2: Multiply loop: Skip odd blocks
"cbz x13, 41f\n"
"tbz x13, #1, 39f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x9], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
"tbz x13, #0, 40f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x11]\n"
"b 40f\n"
"39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x0]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"41:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 31b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q19, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "ldr q18, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "ldr q17, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "ldr q16, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x17, x20\n"
+ "add x25, x17, x20\n"
"prfm pstl1keep, [x17, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "add v12.4s, v12.4s, v19.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 42f\n"
"ldr q0, [x8, #0x0]\n"
@@ -688,10 +687,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -708,30 +707,30 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v14.4s, v14.4s, v6.4s\n"
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
"tbz %x[flags], #5, 44f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "and v19.16b, v12.16b, v0.16b\n"
+ "and v18.16b, v13.16b, v1.16b\n"
+ "and v17.16b, v14.16b, v2.16b\n"
+ "and v16.16b, v15.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v19.4s\n"
+ "sqadd v13.4s, v13.4s, v18.4s\n"
+ "sqadd v14.4s, v14.4s, v17.4s\n"
+ "sqadd v15.4s, v15.4s, v16.4s\n"
"44:" // Height 2: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -741,108 +740,108 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add v12.4s, v12.4s, v16.4s\n"
+ "add v13.4s, v13.4s, v16.4s\n"
+ "add v14.4s, v14.4s, v16.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v17.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v14.8h, v15.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v8.16b, v8.16b, v17.16b\n"
+ "uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
"tbz x16, #3, 48f\n"
"str d8, [x17], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x16, #2, 46f\n"
"st1 { v8.s }[2], [x17], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x16, #1, 45f\n"
"st1 { v8.h }[6], [x17], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
"tbz x16, #0, 52f\n"
"st1 { v8.b }[14], [x17]\n"
- "st1 { v12.b }[14], [x24]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
"tbz x16, #0, 52f\n"
"st1 { v8.b }[12], [x17]\n"
- "st1 { v12.b }[12], [x24]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
"tbz x16, #1, 47f\n"
"st1 { v8.h }[4], [x17], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
"tbz x16, #0, 52f\n"
"st1 { v8.b }[10], [x17]\n"
- "st1 { v12.b }[10], [x24]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
"tbz x16, #0, 52f\n"
"st1 { v8.b }[8], [x17]\n"
- "st1 { v12.b }[8], [x24]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
"tbz x16, #2, 50f\n"
"str s8, [x17], #0x4\n"
- "str s12, [x24], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x16, #1, 49f\n"
"st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
"tbz x16, #0, 52f\n"
"st1 { v8.b }[6], [x17]\n"
- "st1 { v12.b }[6], [x24]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
"tbz x16, #0, 52f\n"
"st1 { v8.b }[4], [x17]\n"
- "st1 { v12.b }[4], [x24]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
"tbz x16, #1, 51f\n"
"str h8, [x17], #0x2\n"
- "str h12, [x24], #0x2\n"
+ "str h12, [x25], #0x2\n"
"tbz x16, #0, 52f\n"
"st1 { v8.b }[2], [x17]\n"
- "st1 { v12.b }[2], [x24]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
"str b8, [x17, #0x0]\n"
- "str b12, [x24, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
"str q8, [x17, #0x0]\n"
"add x17, x17, #0x10\n"
- "str q12, [x24, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
"subs x16, x16, #0x10\n"
"bgt 29b\n"
@@ -872,308 +871,308 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
"cbnz x14, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
- "add x9, x9, x20\n"
- "add x27, x27, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 60f\n"
"59:" // Height 3: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x9, x12, x20\n"
- "add x27, x9, x20\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"60:" // Height 3: input setup done
"cmp x13, #0x10\n"
"blt 63f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
"ldr q7, [x15, #0x10]\n"
"blt 62f\n"
"61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr d21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr d20, [x15, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x15, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x15, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x15, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x15, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x15, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x15, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x15, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x15, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x15, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x15, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
"add x12, x12, #0x10\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x15, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
"ldr x20, [x15, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x12, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x12, #0x8]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ "ldr x22, [x11, #0x8]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
"sub x13, x13, #0x10\n"
"ldr d7, [x15, #0x10]\n"
"cmp x13, #0x20\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x21, [x10, #0x8]\n"
"mov v6.d[1], x20\n"
- "ldr x11, [x15, #0x18]\n"
- "mov v0.d[1], x10\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x23\n"
"prfm pldl1keep, [x12, #0x80]\n"
- "mov v1.d[1], x28\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v2.d[1], x26\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "mov v7.d[1], x11\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 61b\n"
"62:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x15, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x15, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x15, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x15, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x15, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x15, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x15, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x15, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x15, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x15, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x15, #0xf0]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"63:" // Height 3: Multiply loop: Main loop skip
"cbz x13, 68f\n"
"cmp x13, #0x4\n"
"blt 65f\n"
"64:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s24, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s23, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s22, [x10], #0x4\n"
+ "ldr q21, [x15, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x15, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 64b\n"
"65:" // Height 3: Multiply loop: Skip odd blocks
"cbz x13, 68f\n"
"tbz x13, #1, 66f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
"tbz x13, #0, 67f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
"b 67f\n"
"66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x15, #0x0]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x15, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"68:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 58b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q23, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v23.4s\n"
+ "ldr q22, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "ldr q21, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v21.4s\n"
+ "ldr q20, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v20.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x17, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v12.4s, v12.4s, v23.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 69f\n"
"ldr q0, [x8, #0x0]\n"
@@ -1188,10 +1187,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1212,42 +1211,42 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 71f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v23.16b, v12.16b, v0.16b\n"
+ "and v22.16b, v13.16b, v1.16b\n"
+ "and v21.16b, v14.16b, v2.16b\n"
+ "and v20.16b, v15.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v20.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"71:" // Height 3: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -1261,139 +1260,139 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v20.4s\n"
+ "add v9.4s, v9.4s, v20.4s\n"
+ "add v10.4s, v10.4s, v20.4s\n"
+ "add v11.4s, v11.4s, v20.4s\n"
+ "add v12.4s, v12.4s, v20.4s\n"
+ "add v13.4s, v13.4s, v20.4s\n"
+ "add v14.4s, v14.4s, v20.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v20.4s\n"
+ "smin v9.4s, v9.4s, v20.4s\n"
+ "smin v10.4s, v10.4s, v20.4s\n"
+ "smin v11.4s, v11.4s, v20.4s\n"
+ "smin v12.4s, v12.4s, v20.4s\n"
+ "smin v13.4s, v13.4s, v20.4s\n"
+ "smin v14.4s, v14.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v15.4s, v15.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v21.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v20.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v8.16b, v8.16b, v21.16b\n"
+ "uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
"tbz x16, #3, 75f\n"
"str d8, [x17], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x16, #2, 73f\n"
"st1 { v8.s }[2], [x17], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x16, #1, 72f\n"
"st1 { v8.h }[6], [x17], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
"tbz x16, #0, 79f\n"
"st1 { v8.b }[14], [x17]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
"tbz x16, #0, 79f\n"
"st1 { v8.b }[12], [x17]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
"tbz x16, #1, 74f\n"
"st1 { v8.h }[4], [x17], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
"tbz x16, #0, 79f\n"
"st1 { v8.b }[10], [x17]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
"tbz x16, #0, 79f\n"
"st1 { v8.b }[8], [x17]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
"tbz x16, #2, 77f\n"
"str s8, [x17], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x16, #1, 76f\n"
"st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
"tbz x16, #0, 79f\n"
"st1 { v8.b }[6], [x17]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
"tbz x16, #0, 79f\n"
"st1 { v8.b }[4], [x17]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
"tbz x16, #1, 78f\n"
"str h8, [x17], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
"tbz x16, #0, 79f\n"
"st1 { v8.b }[2], [x17]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
"str b8, [x17, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
"str q8, [x17, #0x0]\n"
"add x17, x17, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
"subs x16, x16, #0x10\n"
"bgt 56b\n"
@@ -1427,369 +1426,369 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
"cbnz x14, 87f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
"b 87f\n"
"86:" // Height 4: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x9, x12, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"87:" // Height 4: input setup done
"cmp x13, #0x10\n"
"blt 90f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
"ldr q7, [x15, #0x10]\n"
"blt 89f\n"
"88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr d25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x10, [x12, #0x8]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d24, [x15, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x15, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x15, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x15, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x15, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x15, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x15, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x15, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
"cmp x13, #0x20\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x15, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x15, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x15, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x15, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x20, [x15, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0x18]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0x18]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x9, #0x0]\n"
"ldr d7, [x15, #0x10]\n"
- "mov v6.d[1], x20\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
- "mov v7.d[1], x11\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 88b\n"
"89:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x15, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x15, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x15, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x15, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x15, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x15, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x15, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x15, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x15, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x15, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x15, #0xf0]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"90:" // Height 4: Multiply loop: Main loop skip
"cbz x13, 95f\n"
"cmp x13, #0x4\n"
"blt 92f\n"
"91:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s29, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s28, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s26, [x9], #0x4\n"
+ "ldr q25, [x15, #0x0]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x15, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 91b\n"
"92:" // Height 4: Multiply loop: Skip odd blocks
"cbz x13, 95f\n"
"tbz x13, #1, 93f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
"tbz x13, #0, 94f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
"b 94f\n"
"93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x15, #0x0]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x15, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"95:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 85b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q27, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v27.4s\n"
+ "ldr q26, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "ldr q25, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "ldr q24, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v24.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x17, x20\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
- "add x22, x23, x20\n"
"prfm pstl1keep, [x17, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v26.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v25.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 96f\n"
"ldr q0, [x8, #0x0]\n"
@@ -1804,10 +1803,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1832,54 +1831,54 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v22.4s, v22.4s, v6.4s\n"
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
"tbz %x[flags], #5, 98f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v12.16b, v0.16b\n"
+ "and v26.16b, v13.16b, v1.16b\n"
+ "and v25.16b, v14.16b, v2.16b\n"
+ "and v24.16b, v15.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v27.4s\n"
+ "sqadd v13.4s, v13.4s, v26.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "sqadd v15.4s, v15.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v1.16b\n"
+ "and v25.16b, v22.16b, v2.16b\n"
+ "and v24.16b, v23.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"98:" // Height 4: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -1897,170 +1896,170 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v21.4s, v21.4s, v1.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v24.4s\n"
+ "add v9.4s, v9.4s, v24.4s\n"
+ "add v10.4s, v10.4s, v24.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v12.4s, v12.4s, v24.4s\n"
+ "add v13.4s, v13.4s, v24.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v24.4s\n"
+ "smin v9.4s, v9.4s, v24.4s\n"
+ "smin v10.4s, v10.4s, v24.4s\n"
+ "smin v11.4s, v11.4s, v24.4s\n"
+ "smin v12.4s, v12.4s, v24.4s\n"
+ "smin v13.4s, v13.4s, v24.4s\n"
+ "smin v14.4s, v14.4s, v24.4s\n"
+ "smin v15.4s, v15.4s, v24.4s\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v25.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v24.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v8.16b, v8.16b, v25.16b\n"
+ "uzp1 v12.16b, v12.16b, v24.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
"tbz x16, #3, 102f\n"
"str d8, [x17], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x16, #2, 100f\n"
"st1 { v8.s }[2], [x17], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x16, #1, 99f\n"
"st1 { v8.h }[6], [x17], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x16, #0, 106f\n"
"st1 { v8.b }[14], [x17]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
"tbz x16, #0, 106f\n"
"st1 { v8.b }[12], [x17]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
"tbz x16, #1, 101f\n"
"st1 { v8.h }[4], [x17], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x16, #0, 106f\n"
"st1 { v8.b }[10], [x17]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
"tbz x16, #0, 106f\n"
"st1 { v8.b }[8], [x17]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
"tbz x16, #2, 104f\n"
"str s8, [x17], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x16, #1, 103f\n"
"st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x16, #0, 106f\n"
"st1 { v8.b }[6], [x17]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
"tbz x16, #0, 106f\n"
"st1 { v8.b }[4], [x17]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
"tbz x16, #1, 105f\n"
"str h8, [x17], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x16, #0, 106f\n"
"st1 { v8.b }[2], [x17]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
"str b8, [x17, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
"str q8, [x17, #0x0]\n"
"add x17, x17, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
"subs x16, x16, #0x10\n"
"bgt 83b\n"
@@ -2098,430 +2097,430 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
+ "ldr x28, [x20, #0x20]\n"
"cbnz x14, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
+ "add x28, x28, x20\n"
"b 114f\n"
"113:" // Height 5: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x9, x12, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"114:" // Height 5: input setup done
"cmp x13, #0x10\n"
"blt 117f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
"ldr q7, [x15, #0x10]\n"
"blt 116f\n"
"115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr d29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x10, [x12, #0x8]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x22, [x23, #0x8]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d28, [x15, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x12, #0x8]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x15, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x10, #0x8]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x15, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x9, #0x8]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x28, #0x8]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x15, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
"cmp x13, #0x20\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x15, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x15, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x20\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x15, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x15, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x15, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x15, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x15, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x15, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x20, [x15, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0x18]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0x18]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x9, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x28, #0x0]\n"
"ldr d7, [x15, #0x10]\n"
- "mov v6.d[1], x20\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"bge 115b\n"
"116:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x15, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x15, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x15, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x15, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x15, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x15, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x15, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x15, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x15, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x15, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x15, #0xf0]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"117:" // Height 5: Multiply loop: Main loop skip
"cbz x13, 122f\n"
"cmp x13, #0x4\n"
"blt 119f\n"
"118:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s2, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s1, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s31, [x9], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr q29, [x15, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 118b\n"
"119:" // Height 5: Multiply loop: Skip odd blocks
"cbz x13, 122f\n"
"tbz x13, #1, 120f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h4, [x28], #0x2\n"
"tbz x13, #0, 121f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
+ "ld1 { v4.b }[2], [x28]\n"
"b 121f\n"
"120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
+ "ldr b4, [x28, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x15, #0x0]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"122:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 112b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q31, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v31.4s\n"
+ "ldr q30, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "ldr q29, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "ldr q28, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v28.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x17, x20\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
"add x22, x23, x20\n"
- "add x21, x22, x20\n"
"prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v12.4s, v12.4s, v31.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 123f\n"
"ldr q0, [x8, #0x0]\n"
@@ -2536,10 +2535,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -2568,66 +2567,66 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 125f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v31.16b, v8.16b, v0.16b\n"
+ "and v30.16b, v9.16b, v1.16b\n"
+ "and v29.16b, v10.16b, v2.16b\n"
+ "and v28.16b, v11.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v31.16b, v12.16b, v0.16b\n"
+ "and v30.16b, v13.16b, v1.16b\n"
+ "and v29.16b, v14.16b, v2.16b\n"
+ "and v28.16b, v15.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v31.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqadd v14.4s, v14.4s, v29.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "and v31.16b, v16.16b, v0.16b\n"
+ "and v30.16b, v17.16b, v1.16b\n"
+ "and v29.16b, v18.16b, v2.16b\n"
+ "and v28.16b, v19.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "and v31.16b, v20.16b, v0.16b\n"
+ "and v30.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v22.16b, v2.16b\n"
+ "and v28.16b, v23.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v31.4s\n"
+ "sqadd v21.4s, v21.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v1.16b\n"
+ "and v29.16b, v26.16b, v2.16b\n"
+ "and v28.16b, v27.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"125:" // Height 5: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -2649,201 +2648,201 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v25.4s, v25.4s, v1.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v28.4s\n"
+ "add v10.4s, v10.4s, v28.4s\n"
+ "add v11.4s, v11.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v28.4s\n"
+ "add v13.4s, v13.4s, v28.4s\n"
+ "add v14.4s, v14.4s, v28.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v28.4s\n"
+ "smin v9.4s, v9.4s, v28.4s\n"
+ "smin v10.4s, v10.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v28.4s\n"
+ "smin v12.4s, v12.4s, v28.4s\n"
+ "smin v13.4s, v13.4s, v28.4s\n"
+ "smin v14.4s, v14.4s, v28.4s\n"
+ "smin v15.4s, v15.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v28.4s\n"
+ "smax v9.4s, v9.4s, v28.4s\n"
+ "smax v10.4s, v10.4s, v28.4s\n"
+ "smax v11.4s, v11.4s, v28.4s\n"
+ "smax v12.4s, v12.4s, v28.4s\n"
+ "smax v13.4s, v13.4s, v28.4s\n"
+ "smax v14.4s, v14.4s, v28.4s\n"
+ "smax v15.4s, v15.4s, v28.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v29.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v28.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v8.16b, v8.16b, v29.16b\n"
+ "uzp1 v12.16b, v12.16b, v28.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
"tbz x16, #3, 129f\n"
"str d8, [x17], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x16, #2, 127f\n"
"st1 { v8.s }[2], [x17], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x16, #1, 126f\n"
"st1 { v8.h }[6], [x17], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x16, #0, 133f\n"
"st1 { v8.b }[14], [x17]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
"tbz x16, #0, 133f\n"
"st1 { v8.b }[12], [x17]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
"tbz x16, #1, 128f\n"
"st1 { v8.h }[4], [x17], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x16, #0, 133f\n"
"st1 { v8.b }[10], [x17]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
"tbz x16, #0, 133f\n"
"st1 { v8.b }[8], [x17]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
"tbz x16, #2, 131f\n"
"str s8, [x17], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x16, #1, 130f\n"
"st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x16, #0, 133f\n"
"st1 { v8.b }[6], [x17]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
"tbz x16, #0, 133f\n"
"st1 { v8.b }[4], [x17]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
"tbz x16, #1, 132f\n"
"str h8, [x17], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x16, #0, 133f\n"
"st1 { v8.b }[2], [x17]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
"str b8, [x17, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
"str q8, [x17, #0x0]\n"
"add x17, x17, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
"subs x16, x16, #0x10\n"
"bgt 110b\n"
@@ -2888,191 +2887,191 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
- "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x12, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
+ "ldr x28, [x20, #0x20]\n"
+ "ldr x27, [x20, #0x28]\n"
"cbnz x14, 141f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
- "add x21, x21, x20\n"
"b 141f\n"
"140:" // Height 6: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x9, x12, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
- "add x21, x23, x20\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"141:" // Height 6: input setup done
"cmp x13, #0x10\n"
"blt 144f\n"
"ldr q0, [x12, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x27, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
"ldr q7, [x15, #0x10]\n"
"blt 143f\n"
"142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x20, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x15, #0x30]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x12, #0x8]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x15, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x20, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
"cmp x13, #0x20\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x15, #0x50]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x15, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0x88]\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x15, #0x70]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x15, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x20, [x15, #0xa8]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x15, #0x90]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x15, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xc8]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x15, #0xb0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x15, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x20, [x15, #0xe8]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x15, #0xd0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x15, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x15, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"add x15, x15, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
"ldr x20, [x15, #0x8]\n"
@@ -3085,58 +3084,58 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x11, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x10, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d3, [x9, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ "ldr d4, [x28, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d5, [x21, #0x0]\n"
+ "ldr d5, [x27, #0x0]\n"
"ldr d7, [x15, #0x10]\n"
"mov v6.d[1], x20\n"
- "ldr x20, [x21, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x15, #0x18]\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "ldr x21, [x27, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v5.d[1], x20\n"
- "mov v7.d[1], x11\n"
+ "mov v5.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"bge 142b\n"
"143:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x15, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3236,143 +3235,143 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x13, #0x4\n"
"blt 146f\n"
"145:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s7, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s6, [x11], #0x4\n"
"cmp x13, #0x4\n"
+ "ldr s5, [x10], #0x4\n"
+ "ldr s4, [x9], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
"ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q1, [x15, #0x0]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x15, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x15, #0x30]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 145b\n"
"146:" // Height 6: Multiply loop: Skip odd blocks
"cbz x13, 149f\n"
"tbz x13, #1, 147f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x21], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h4, [x28], #0x2\n"
+ "ldr h5, [x27], #0x2\n"
"tbz x13, #0, 148f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
+ "ld1 { v4.b }[2], [x28]\n"
+ "ld1 { v5.b }[2], [x27]\n"
"b 148f\n"
"147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "ldr b5, [x21, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
+ "ldr b4, [x28, #0x0]\n"
+ "ldr b5, [x27, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x15, #0x0]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x15, #0x10]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x15, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x15, #0x30]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"149:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
"cmp x14, x20\n"
"bne 139b\n"
- "ldr q0, [x6, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x6, #0x10]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x6, #0x20]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "ldr q3, [x6, #0x30]\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q3, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v3.4s\n"
+ "ldr q2, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v2.4s\n"
+ "ldr q1, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "ldr q0, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v0.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x17, x20\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
"add x22, x23, x20\n"
"add x21, x22, x20\n"
- "add x20, x21, x20\n"
"prfm pstl1keep, [x17, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v2.4s\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v1.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
"add x6, x6, #0x40\n"
"tbz %x[flags], #4, 150f\n"
"ldr q0, [x8, #0x0]\n"
@@ -3387,10 +3386,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"add x7, x7, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -3423,78 +3422,78 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v30.4s, v30.4s, v6.4s\n"
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
"tbz %x[flags], #5, 152f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v12.16b, v0.16b\n"
+ "and v6.16b, v13.16b, v1.16b\n"
+ "and v5.16b, v14.16b, v2.16b\n"
+ "and v4.16b, v15.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v7.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "sqadd v14.4s, v14.4s, v5.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
- "and v4.16b, v28.16b, v0.16b\n"
- "and v5.16b, v29.16b, v1.16b\n"
- "and v6.16b, v30.16b, v2.16b\n"
- "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v1.16b\n"
+ "and v5.16b, v22.16b, v2.16b\n"
+ "and v4.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v7.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v1.16b\n"
+ "and v5.16b, v30.16b, v2.16b\n"
+ "and v4.16b, v31.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v4.4s\n"
- "sqadd v29.4s, v29.4s, v5.4s\n"
- "sqadd v30.4s, v30.4s, v6.4s\n"
- "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
"152:" // Height 6: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -3520,232 +3519,232 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v29.4s, v29.4s, v1.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v0.4s\n"
+ "add v10.4s, v10.4s, v0.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v13.4s, v13.4s, v0.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v0.4s\n"
+ "smin v9.4s, v9.4s, v0.4s\n"
+ "smin v10.4s, v10.4s, v0.4s\n"
+ "smin v11.4s, v11.4s, v0.4s\n"
+ "smin v12.4s, v12.4s, v0.4s\n"
+ "smin v13.4s, v13.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v0.4s\n"
+ "smin v15.4s, v15.4s, v0.4s\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v0.4s\n"
+ "smax v9.4s, v9.4s, v0.4s\n"
+ "smax v10.4s, v10.4s, v0.4s\n"
+ "smax v11.4s, v11.4s, v0.4s\n"
+ "smax v12.4s, v12.4s, v0.4s\n"
+ "smax v13.4s, v13.4s, v0.4s\n"
+ "smax v14.4s, v14.4s, v0.4s\n"
+ "smax v15.4s, v15.4s, v0.4s\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v2.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v1.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
"cmp x16, #0x10\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v8.16b, v8.16b, v2.16b\n"
+ "uzp1 v12.16b, v12.16b, v1.16b\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
"tbz x16, #3, 156f\n"
"str d8, [x17], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x16, #2, 154f\n"
"st1 { v8.s }[2], [x17], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x16, #1, 153f\n"
"st1 { v8.h }[6], [x17], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x16, #0, 160f\n"
"st1 { v8.b }[14], [x17]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
"tbz x16, #0, 160f\n"
"st1 { v8.b }[12], [x17]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
"tbz x16, #1, 155f\n"
"st1 { v8.h }[4], [x17], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x16, #0, 160f\n"
"st1 { v8.b }[10], [x17]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
"tbz x16, #0, 160f\n"
"st1 { v8.b }[8], [x17]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
"tbz x16, #2, 158f\n"
"str s8, [x17], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x16, #1, 157f\n"
"st1 { v8.h }[2], [x17], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x16, #0, 160f\n"
"st1 { v8.b }[6], [x17]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
"tbz x16, #0, 160f\n"
"st1 { v8.b }[4], [x17]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
"tbz x16, #1, 159f\n"
"str h8, [x17], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x16, #0, 160f\n"
"st1 { v8.b }[2], [x17]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
"str b8, [x17, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
"str q8, [x17, #0x0]\n"
"add x17, x17, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
"subs x16, x16, #0x10\n"
"bgt 137b\n"
@@ -3761,7 +3760,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 598d1524e8..f3942328a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 136f\n"
@@ -111,11 +110,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -132,37 +131,37 @@ void a64_hybrid_s8qs_dot_6x16 (
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x20\n"
"add x9, x9, #0x100\n"
@@ -172,37 +171,37 @@ void a64_hybrid_s8qs_dot_6x16 (
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x9, x9, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
@@ -210,17 +209,17 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x4\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x9, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
- "ldr q7, [x9, #0x10]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
"cmp x27, #0x4\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x9, x9, #0x40\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
@@ -233,28 +232,28 @@ void a64_hybrid_s8qs_dot_6x16 (
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x26, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x9, x9, #0x40\n"
"14:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 4b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q16, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v17.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"prfm pstl1keep, [x11, #0x0]\n"
"add x14, x14, #0x40\n"
"tbz %x[flags], #4, 15f\n"
@@ -270,10 +269,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -286,45 +285,45 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 17f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
"17:" // Height 1: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
"cmp x10, #0x10\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
"tbz x10, #3, 21f\n"
"str d8, [x11], #0x8\n"
@@ -399,12 +398,12 @@ void a64_hybrid_s8qs_dot_6x16 (
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -412,7 +411,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"b 33f\n"
"32:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"33:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 36f\n"
@@ -425,137 +424,137 @@ void a64_hybrid_s8qs_dot_6x16 (
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
"sub x27, x27, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
"cmp x27, #0x20\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x9, #0x10]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q17, [x9, #0x20]\n"
"add x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q16, [x9, #0x30]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"36:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 41f\n"
"cmp x27, #0x4\n"
"blt 38f\n"
"37:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 37b\n"
"38:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 41f\n"
@@ -570,41 +569,41 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b0, [x26, #0x0]\n"
"ldr b1, [x25, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"41:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 31b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
+ "add x25, x11, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
+ "add v12.4s, v12.4s, v19.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
"add x14, x14, #0x40\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"tbz %x[flags], #4, 42f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -618,10 +617,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -638,141 +637,141 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v14.4s, v14.4s, v6.4s\n"
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
"tbz %x[flags], #5, 44f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "and v19.16b, v12.16b, v0.16b\n"
+ "and v18.16b, v13.16b, v1.16b\n"
+ "and v17.16b, v14.16b, v2.16b\n"
+ "and v16.16b, v15.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v19.4s\n"
+ "sqadd v13.4s, v13.4s, v18.4s\n"
+ "sqadd v14.4s, v14.4s, v17.4s\n"
+ "sqadd v15.4s, v15.4s, v16.4s\n"
"44:" // Height 2: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"cmp x10, #0x10\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v18.4s\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v17.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.8h, v14.8h, v15.8h\n"
+ "uzp1 v8.16b, v8.16b, v17.16b\n"
+ "uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
"tbz x10, #3, 48f\n"
"str d8, [x11], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x10, #2, 46f\n"
"st1 { v8.s }[2], [x11], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x10, #1, 45f\n"
"st1 { v8.h }[6], [x11], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
"st1 { v8.b }[14], [x11]\n"
- "st1 { v12.b }[14], [x24]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
"tbz x10, #0, 52f\n"
"st1 { v8.b }[12], [x11]\n"
- "st1 { v12.b }[12], [x24]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
"tbz x10, #1, 47f\n"
"st1 { v8.h }[4], [x11], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
"st1 { v8.b }[10], [x11]\n"
- "st1 { v12.b }[10], [x24]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
"tbz x10, #0, 52f\n"
"st1 { v8.b }[8], [x11]\n"
- "st1 { v12.b }[8], [x24]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
"tbz x10, #2, 50f\n"
"str s8, [x11], #0x4\n"
- "str s12, [x24], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x10, #1, 49f\n"
"st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
"st1 { v8.b }[6], [x11]\n"
- "st1 { v12.b }[6], [x24]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
"tbz x10, #0, 52f\n"
"st1 { v8.b }[4], [x11]\n"
- "st1 { v12.b }[4], [x24]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
"tbz x10, #1, 51f\n"
"str h8, [x11], #0x2\n"
- "str h12, [x24], #0x2\n"
+ "str h12, [x25], #0x2\n"
"tbz x10, #0, 52f\n"
"st1 { v8.b }[2], [x11]\n"
- "st1 { v12.b }[2], [x24]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
"str b8, [x11, #0x0]\n"
- "str b12, [x24, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
"str q8, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q12, [x24, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
"subs x10, x10, #0x10\n"
"bgt 29b\n"
@@ -802,13 +801,13 @@ void a64_hybrid_s8qs_dot_6x16 (
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -817,8 +816,8 @@ void a64_hybrid_s8qs_dot_6x16 (
"b 60f\n"
"59:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"60:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 63f\n"
@@ -835,75 +834,75 @@ void a64_hybrid_s8qs_dot_6x16 (
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x25, x25, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"cmp x27, #0x20\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x9, #0x10]\n"
"bge 61b\n"
@@ -913,98 +912,98 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q20, [x9, #0x30]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"63:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 68f\n"
"cmp x27, #0x4\n"
"blt 65f\n"
"64:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x9, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x9, #0x10]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x9, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 64b\n"
"65:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 68f\n"
@@ -1022,51 +1021,51 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b1, [x25, #0x0]\n"
"ldr b2, [x24, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x10]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"68:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 58b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q23, [x14, #0x0]\n"
+ "ldr q22, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v23.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "ldr q21, [x14, #0x20]\n"
+ "ldr q20, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v21.4s\n"
+ "add v11.4s, v11.4s, v20.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "add v12.4s, v12.4s, v23.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
"add x14, x14, #0x40\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #4, 69f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1080,10 +1079,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1104,55 +1103,55 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 71f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v23.16b, v12.16b, v0.16b\n"
+ "and v22.16b, v13.16b, v1.16b\n"
+ "and v21.16b, v14.16b, v2.16b\n"
+ "and v20.16b, v15.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v20.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"71:" // Height 3: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -1160,132 +1159,132 @@ void a64_hybrid_s8qs_dot_6x16 (
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add v8.4s, v8.4s, v22.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "add v10.4s, v10.4s, v22.4s\n"
+ "add v11.4s, v11.4s, v22.4s\n"
+ "add v12.4s, v12.4s, v22.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v22.4s\n"
+ "add v15.4s, v15.4s, v22.4s\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
+ "smin v8.4s, v8.4s, v21.4s\n"
+ "smin v9.4s, v9.4s, v21.4s\n"
+ "smin v10.4s, v10.4s, v21.4s\n"
+ "smin v11.4s, v11.4s, v21.4s\n"
+ "smin v12.4s, v12.4s, v21.4s\n"
+ "smin v13.4s, v13.4s, v21.4s\n"
+ "smin v14.4s, v14.4s, v21.4s\n"
+ "smin v15.4s, v15.4s, v21.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v15.4s, v15.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v21.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v20.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v8.16b, v8.16b, v21.16b\n"
+ "uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
"tbz x10, #3, 75f\n"
"str d8, [x11], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x10, #2, 73f\n"
"st1 { v8.s }[2], [x11], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x10, #1, 72f\n"
"st1 { v8.h }[6], [x11], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
"st1 { v8.b }[14], [x11]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
"tbz x10, #0, 79f\n"
"st1 { v8.b }[12], [x11]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
"tbz x10, #1, 74f\n"
"st1 { v8.h }[4], [x11], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
"st1 { v8.b }[10], [x11]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
"tbz x10, #0, 79f\n"
"st1 { v8.b }[8], [x11]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
"tbz x10, #2, 77f\n"
"str s8, [x11], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x10, #1, 76f\n"
"st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
"st1 { v8.b }[6], [x11]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
"tbz x10, #0, 79f\n"
"st1 { v8.b }[4], [x11]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
"tbz x10, #1, 78f\n"
"str h8, [x11], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
"tbz x10, #0, 79f\n"
"st1 { v8.b }[2], [x11]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
"str b8, [x11, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
"str q8, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
"subs x10, x10, #0x10\n"
"bgt 56b\n"
@@ -1319,14 +1318,14 @@ void a64_hybrid_s8qs_dot_6x16 (
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 87f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1336,9 +1335,9 @@ void a64_hybrid_s8qs_dot_6x16 (
"b 87f\n"
"86:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"87:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 90f\n"
@@ -1357,7 +1356,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
"add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1365,85 +1364,85 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
"cmp x27, #0x20\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x9, #0x10]\n"
"bge 88b\n"
@@ -1454,7 +1453,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q25, [x9, #0x20]\n"
"add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1462,112 +1461,112 @@ void a64_hybrid_s8qs_dot_6x16 (
"sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q24, [x9, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"90:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 95f\n"
"cmp x27, #0x4\n"
"blt 92f\n"
"91:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 91b\n"
"92:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 95f\n"
@@ -1588,61 +1587,61 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b2, [x24, #0x0]\n"
"ldr b3, [x23, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"95:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 85b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q27, [x14, #0x0]\n"
+ "ldr q26, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v27.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "add v12.4s, v12.4s, v27.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v25.4s\n"
"add x14, x14, #0x40\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #4, 96f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1656,10 +1655,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1684,67 +1683,67 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v22.4s, v22.4s, v6.4s\n"
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
"tbz %x[flags], #5, 98f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v12.16b, v0.16b\n"
+ "and v26.16b, v13.16b, v1.16b\n"
+ "and v25.16b, v14.16b, v2.16b\n"
+ "and v24.16b, v15.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v27.4s\n"
+ "sqadd v13.4s, v13.4s, v26.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "sqadd v15.4s, v15.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v1.16b\n"
+ "and v25.16b, v22.16b, v2.16b\n"
+ "and v24.16b, v23.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"98:" // Height 4: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -1756,163 +1755,163 @@ void a64_hybrid_s8qs_dot_6x16 (
"srshl v21.4s, v21.4s, v1.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add v8.4s, v8.4s, v26.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "add v10.4s, v10.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v26.4s\n"
+ "add v12.4s, v12.4s, v26.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v26.4s\n"
+ "add v15.4s, v15.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v8.4s, v8.4s, v25.4s\n"
+ "smin v9.4s, v9.4s, v25.4s\n"
+ "smin v10.4s, v10.4s, v25.4s\n"
+ "smin v11.4s, v11.4s, v25.4s\n"
+ "smin v12.4s, v12.4s, v25.4s\n"
+ "smin v13.4s, v13.4s, v25.4s\n"
+ "smin v14.4s, v14.4s, v25.4s\n"
+ "smin v15.4s, v15.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v25.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v24.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v8.16b, v8.16b, v25.16b\n"
+ "uzp1 v12.16b, v12.16b, v24.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
"tbz x10, #3, 102f\n"
"str d8, [x11], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x10, #2, 100f\n"
"st1 { v8.s }[2], [x11], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x10, #1, 99f\n"
"st1 { v8.h }[6], [x11], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
"st1 { v8.b }[14], [x11]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
"tbz x10, #0, 106f\n"
"st1 { v8.b }[12], [x11]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
"tbz x10, #1, 101f\n"
"st1 { v8.h }[4], [x11], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
"st1 { v8.b }[10], [x11]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
"tbz x10, #0, 106f\n"
"st1 { v8.b }[8], [x11]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
"tbz x10, #2, 104f\n"
"str s8, [x11], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x10, #1, 103f\n"
"st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
"st1 { v8.b }[6], [x11]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
"tbz x10, #0, 106f\n"
"st1 { v8.b }[4], [x11]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
"tbz x10, #1, 105f\n"
"str h8, [x11], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x10, #0, 106f\n"
"st1 { v8.b }[2], [x11]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
"str b8, [x11, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
"str q8, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
"subs x10, x10, #0x10\n"
"bgt 83b\n"
@@ -1950,15 +1949,15 @@ void a64_hybrid_s8qs_dot_6x16 (
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 114f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1969,10 +1968,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"b 114f\n"
"113:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"114:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 117f\n"
@@ -1995,7 +1994,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2004,100 +2003,100 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x20\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q28, [x9, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x9, #0x10]\n"
"bge 115b\n"
@@ -2111,7 +2110,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2120,131 +2119,131 @@ void a64_hybrid_s8qs_dot_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q28, [x9, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"117:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 122f\n"
"cmp x27, #0x4\n"
"blt 119f\n"
"118:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x9, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x9, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 118b\n"
"119:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 122f\n"
@@ -2268,71 +2267,71 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b3, [x23, #0x0]\n"
"ldr b4, [x22, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"122:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 112b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q30, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v31.4s\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "ldr q29, [x14, #0x20]\n"
+ "ldr q28, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "add v11.4s, v11.4s, v28.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
"add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "add v12.4s, v12.4s, v31.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
"add x14, x14, #0x40\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #4, 123f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -2346,10 +2345,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -2378,79 +2377,79 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 125f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v31.16b, v8.16b, v0.16b\n"
+ "and v30.16b, v9.16b, v1.16b\n"
+ "and v29.16b, v10.16b, v2.16b\n"
+ "and v28.16b, v11.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v31.16b, v12.16b, v0.16b\n"
+ "and v30.16b, v13.16b, v1.16b\n"
+ "and v29.16b, v14.16b, v2.16b\n"
+ "and v28.16b, v15.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v31.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqadd v14.4s, v14.4s, v29.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "and v31.16b, v16.16b, v0.16b\n"
+ "and v30.16b, v17.16b, v1.16b\n"
+ "and v29.16b, v18.16b, v2.16b\n"
+ "and v28.16b, v19.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "and v31.16b, v20.16b, v0.16b\n"
+ "and v30.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v22.16b, v2.16b\n"
+ "and v28.16b, v23.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v31.4s\n"
+ "sqadd v21.4s, v21.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v1.16b\n"
+ "and v29.16b, v26.16b, v2.16b\n"
+ "and v28.16b, v27.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"125:" // Height 5: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -2466,194 +2465,194 @@ void a64_hybrid_s8qs_dot_6x16 (
"srshl v25.4s, v25.4s, v1.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v8.4s, v8.4s, v30.4s\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v30.4s\n"
+ "add v11.4s, v11.4s, v30.4s\n"
+ "add v12.4s, v12.4s, v30.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
+ "add v14.4s, v14.4s, v30.4s\n"
+ "add v15.4s, v15.4s, v30.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v8.4s, v8.4s, v29.4s\n"
+ "smin v9.4s, v9.4s, v29.4s\n"
+ "smin v10.4s, v10.4s, v29.4s\n"
+ "smin v11.4s, v11.4s, v29.4s\n"
+ "smin v12.4s, v12.4s, v29.4s\n"
+ "smin v13.4s, v13.4s, v29.4s\n"
+ "smin v14.4s, v14.4s, v29.4s\n"
+ "smin v15.4s, v15.4s, v29.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v8.4s, v8.4s, v28.4s\n"
+ "smax v9.4s, v9.4s, v28.4s\n"
+ "smax v10.4s, v10.4s, v28.4s\n"
+ "smax v11.4s, v11.4s, v28.4s\n"
+ "smax v12.4s, v12.4s, v28.4s\n"
+ "smax v13.4s, v13.4s, v28.4s\n"
+ "smax v14.4s, v14.4s, v28.4s\n"
+ "smax v15.4s, v15.4s, v28.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v29.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v28.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v29.16b\n"
+ "uzp1 v12.16b, v12.16b, v28.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
"tbz x10, #3, 129f\n"
"str d8, [x11], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x10, #2, 127f\n"
"st1 { v8.s }[2], [x11], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x10, #1, 126f\n"
"st1 { v8.h }[6], [x11], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
"st1 { v8.b }[14], [x11]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
"tbz x10, #0, 133f\n"
"st1 { v8.b }[12], [x11]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
"tbz x10, #1, 128f\n"
"st1 { v8.h }[4], [x11], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
"st1 { v8.b }[10], [x11]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
"tbz x10, #0, 133f\n"
"st1 { v8.b }[8], [x11]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
"tbz x10, #2, 131f\n"
"str s8, [x11], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x10, #1, 130f\n"
"st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
"st1 { v8.b }[6], [x11]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
"tbz x10, #0, 133f\n"
"st1 { v8.b }[4], [x11]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
"tbz x10, #1, 132f\n"
"str h8, [x11], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x10, #0, 133f\n"
"st1 { v8.b }[2], [x11]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
"str b8, [x11, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
"str q8, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
"subs x10, x10, #0x10\n"
"bgt 110b\n"
@@ -2698,16 +2697,16 @@ void a64_hybrid_s8qs_dot_6x16 (
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 141f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2719,11 +2718,11 @@ void a64_hybrid_s8qs_dot_6x16 (
"b 141f\n"
"140:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"141:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 144f\n"
@@ -3002,43 +3001,43 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x27, #0x4\n"
"blt 146f\n"
"145:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 145b\n"
"146:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 149f\n"
@@ -3065,81 +3064,81 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr b4, [x22, #0x0]\n"
"ldr b5, [x21, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x9, #0x30]\n"
"add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"149:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 139b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q2, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v3.4s\n"
+ "add v9.4s, v9.4s, v2.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
"add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x20, x21, x20\n"
"prfm pstl1keep, [x22, #0x0]\n"
+ "add v12.4s, v12.4s, v3.4s\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v1.4s\n"
"add x14, x14, #0x40\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
"tbz %x[flags], #4, 150f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -3153,10 +3152,10 @@ void a64_hybrid_s8qs_dot_6x16 (
"add x13, x13, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -3189,91 +3188,91 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v30.4s, v30.4s, v6.4s\n"
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
"tbz %x[flags], #5, 152f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v12.16b, v0.16b\n"
+ "and v6.16b, v13.16b, v1.16b\n"
+ "and v5.16b, v14.16b, v2.16b\n"
+ "and v4.16b, v15.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v7.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "sqadd v14.4s, v14.4s, v5.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
- "and v4.16b, v28.16b, v0.16b\n"
- "and v5.16b, v29.16b, v1.16b\n"
- "and v6.16b, v30.16b, v2.16b\n"
- "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v1.16b\n"
+ "and v5.16b, v22.16b, v2.16b\n"
+ "and v4.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v7.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v1.16b\n"
+ "and v5.16b, v30.16b, v2.16b\n"
+ "and v4.16b, v31.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v4.4s\n"
- "sqadd v29.4s, v29.4s, v5.4s\n"
- "sqadd v30.4s, v30.4s, v6.4s\n"
- "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
"152:" // Height 6: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -3293,225 +3292,225 @@ void a64_hybrid_s8qs_dot_6x16 (
"srshl v29.4s, v29.4s, v1.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add v8.4s, v8.4s, v6.4s\n"
+ "add v9.4s, v9.4s, v6.4s\n"
+ "add v10.4s, v10.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v12.4s, v12.4s, v6.4s\n"
+ "add v13.4s, v13.4s, v6.4s\n"
+ "add v14.4s, v14.4s, v6.4s\n"
+ "add v15.4s, v15.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v6.4s\n"
+ "add v17.4s, v17.4s, v6.4s\n"
+ "add v18.4s, v18.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v6.4s\n"
+ "add v29.4s, v29.4s, v6.4s\n"
+ "add v30.4s, v30.4s, v6.4s\n"
+ "add v31.4s, v31.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v5.4s\n"
+ "smin v9.4s, v9.4s, v5.4s\n"
+ "smin v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v5.4s\n"
+ "smin v12.4s, v12.4s, v5.4s\n"
+ "smin v13.4s, v13.4s, v5.4s\n"
+ "smin v14.4s, v14.4s, v5.4s\n"
+ "smin v15.4s, v15.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v5.4s\n"
+ "smin v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v5.4s\n"
+ "smin v21.4s, v21.4s, v5.4s\n"
+ "smin v22.4s, v22.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v5.4s\n"
+ "smin v26.4s, v26.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v5.4s\n"
+ "smin v29.4s, v29.4s, v5.4s\n"
+ "smin v30.4s, v30.4s, v5.4s\n"
+ "smin v31.4s, v31.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v4.4s\n"
+ "smax v9.4s, v9.4s, v4.4s\n"
+ "smax v10.4s, v10.4s, v4.4s\n"
+ "smax v11.4s, v11.4s, v4.4s\n"
+ "smax v12.4s, v12.4s, v4.4s\n"
+ "smax v13.4s, v13.4s, v4.4s\n"
+ "smax v14.4s, v14.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v4.4s\n"
+ "smax v18.4s, v18.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v4.4s\n"
+ "smax v21.4s, v21.4s, v4.4s\n"
+ "smax v22.4s, v22.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v4.4s\n"
+ "smax v26.4s, v26.4s, v4.4s\n"
+ "smax v27.4s, v27.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v4.4s\n"
+ "smax v29.4s, v29.4s, v4.4s\n"
+ "smax v30.4s, v30.4s, v4.4s\n"
+ "smax v31.4s, v31.4s, v4.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v2.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v1.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v8.16b, v8.16b, v2.16b\n"
+ "uzp1 v12.16b, v12.16b, v1.16b\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
"tbz x10, #3, 156f\n"
"str d8, [x11], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x10, #2, 154f\n"
"st1 { v8.s }[2], [x11], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x10, #1, 153f\n"
"st1 { v8.h }[6], [x11], #0x2\n"
- "st1 { v12.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
"st1 { v8.b }[14], [x11]\n"
- "st1 { v12.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
"tbz x10, #0, 160f\n"
"st1 { v8.b }[12], [x11]\n"
- "st1 { v12.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
"tbz x10, #1, 155f\n"
"st1 { v8.h }[4], [x11], #0x2\n"
- "st1 { v12.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
"st1 { v8.b }[10], [x11]\n"
- "st1 { v12.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
"tbz x10, #0, 160f\n"
"st1 { v8.b }[8], [x11]\n"
- "st1 { v12.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
"tbz x10, #2, 158f\n"
"str s8, [x11], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x10, #1, 157f\n"
"st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v12.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
"st1 { v8.b }[6], [x11]\n"
- "st1 { v12.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
"tbz x10, #0, 160f\n"
"st1 { v8.b }[4], [x11]\n"
- "st1 { v12.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
"tbz x10, #1, 159f\n"
"str h8, [x11], #0x2\n"
- "str h12, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x10, #0, 160f\n"
"st1 { v8.b }[2], [x11]\n"
- "st1 { v12.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
"str b8, [x11, #0x0]\n"
- "str b12, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
"str q8, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q12, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
"subs x10, x10, #0x10\n"
"bgt 137b\n"
@@ -3527,7 +3526,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
index 7eacdceae7..d0d5f1b80d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -98,5 +98,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
index fc525531b2..0771829d37 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -85,7 +85,6 @@ void a64_hybrid_s8qs_mmla_6x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 146f\n"
@@ -115,11 +114,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -135,41 +134,41 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q6, [x9, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "trn1 v18.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"cmp x27, #0x20\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"ldr q1, [x26, #0x0]\n"
"add x9, x9, #0x100\n"
"ldr q7, [x9, #0x0]\n"
@@ -177,40 +176,40 @@ void a64_hybrid_s8qs_mmla_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "trn1 v18.2d, v1.2d, v19.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v19.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x9, x9, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
@@ -218,26 +217,26 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr q6, [x9, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr q17, [x9, #0x0]\n"
+ "trn1 v18.2d, v18.2d, v16.2d\n"
+ "ldr q31, [x9, #0x10]\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e9fa64c // smmla v12.4s, v18.16b, v31.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
"add x9, x9, #0x80\n"
"bge 10b\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
@@ -262,44 +261,44 @@ void a64_hybrid_s8qs_mmla_6x16 (
"14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"15:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q7, [x9, #0x0]\n"
- "ldr q6, [x9, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q19, [x9, #0x10]\n"
+ "trn1 v18.2d, v1.2d, v16.2d\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e93a64c // smmla v12.4s, v18.16b, v19.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
"add x9, x9, #0x80\n"
"16:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 4b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
"uzp1 v8.2d, v8.2d, v12.2d\n"
"uzp1 v9.2d, v9.2d, v13.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
"uzp1 v10.2d, v10.2d, v14.2d\n"
"uzp1 v11.2d, v11.2d, v15.2d\n"
"mov v15.16b, v8.16b\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "add v15.4s, v15.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v19.4s\n"
"add x14, x14, #0x40\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"tbz %x[flags], #4, 17f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -313,10 +312,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 18f\n"
"17:" // Height 1: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -329,45 +328,45 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 19f\n"
- "and v4.16b, v15.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v17.16b, v15.16b, v0.16b\n"
+ "and v16.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v18.16b, v11.16b, v3.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v17.4s\n"
+ "sqadd v9.4s, v9.4s, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v18.4s\n"
"19:" // Height 1: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
"cmp x10, #0x10\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v15.8h, v15.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "uzp1 v15.16b, v15.16b, v9.16b\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.16b, v15.16b, v16.16b\n"
"bge 28f\n"
"tbz x10, #3, 23f\n"
"str d15, [x11], #0x8\n"
@@ -442,12 +441,12 @@ void a64_hybrid_s8qs_mmla_6x16 (
"33:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 35f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -455,7 +454,7 @@ void a64_hybrid_s8qs_mmla_6x16 (
"b 35f\n"
"34:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"35:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 38f\n"
@@ -466,85 +465,85 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q6, [x9, #0x10]\n"
"blt 37f\n"
"36:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x20\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
"add x9, x9, #0x100\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x9, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"bge 36b\n"
"37:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
"sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
@@ -554,27 +553,27 @@ void a64_hybrid_s8qs_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 40f\n"
"39:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d16, [x25], #0x8\n"
+ "trn1 v18.2d, v17.2d, v16.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- "ldr q6, [x9, #0x20]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q6, [x9, #0x40]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- "ldr q6, [x9, #0x60]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ "ldr q16, [x9, #0x70]\n"
"cmp x27, #0x8\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
"add x9, x9, #0x80\n"
"bge 39b\n"
"40:" // Height 2: Multiply loop: Skip odd blocks
@@ -606,55 +605,55 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b1, [x26, #0x0]\n"
"ldr b2, [x25, #0x0]\n"
"44:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q7, [x9, #0x0]\n"
- "ldr q6, [x9, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
"add x9, x9, #0x80\n"
"45:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 33b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
+ "uzp1 v17.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q5, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x11, x20\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "mov v15.16b, v7.16b\n"
- "add v15.4s, v15.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "mov v15.16b, v17.16b\n"
+ "add v15.4s, v15.4s, v19.4s\n"
"add x14, x14, #0x40\n"
- "add v12.4s, v12.4s, v1.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v3.4s\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
"tbz %x[flags], #4, 46f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -668,10 +667,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 47f\n"
"46:" // Height 2: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -688,141 +687,141 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 48f\n"
- "and v4.16b, v15.16b, v0.16b\n"
- "and v5.16b, v12.16b, v1.16b\n"
- "and v6.16b, v13.16b, v2.16b\n"
- "and v7.16b, v14.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sqadd v12.4s, v12.4s, v5.4s\n"
- "sqadd v13.4s, v13.4s, v6.4s\n"
- "sqadd v14.4s, v14.4s, v7.4s\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v19.16b, v15.16b, v0.16b\n"
+ "and v18.16b, v12.16b, v1.16b\n"
+ "and v17.16b, v13.16b, v2.16b\n"
+ "and v16.16b, v14.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v19.4s\n"
+ "sqadd v12.4s, v12.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v17.4s\n"
+ "sqadd v14.4s, v14.4s, v16.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
"48:" // Height 2: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v15.4s, v15.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"cmp x10, #0x10\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v18.4s\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v15.8h, v15.8h, v12.8h\n"
- "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v17.8h, v13.8h, v14.8h\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "uzp1 v15.16b, v15.16b, v12.16b\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.16b, v15.16b, v17.16b\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 57f\n"
"tbz x10, #3, 52f\n"
"str d15, [x11], #0x8\n"
- "str d8, [x24], #0x8\n"
+ "str d8, [x25], #0x8\n"
"tbz x10, #2, 50f\n"
"st1 { v15.s }[2], [x11], #0x4\n"
- "st1 { v8.s }[2], [x24], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
"tbz x10, #1, 49f\n"
"st1 { v15.h }[6], [x11], #0x2\n"
- "st1 { v8.h }[6], [x24], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
"tbz x10, #0, 56f\n"
"st1 { v15.b }[14], [x11]\n"
- "st1 { v8.b }[14], [x24]\n"
+ "st1 { v8.b }[14], [x25]\n"
"b 56f\n"
"49:" // Height 2: Partial direct writeback: partial_1_12
"tbz x10, #0, 56f\n"
"st1 { v15.b }[12], [x11]\n"
- "st1 { v8.b }[12], [x24]\n"
+ "st1 { v8.b }[12], [x25]\n"
"b 56f\n"
"50:" // Height 2: Partial direct writeback: partial_2_8
"tbz x10, #1, 51f\n"
"st1 { v15.h }[4], [x11], #0x2\n"
- "st1 { v8.h }[4], [x24], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
"tbz x10, #0, 56f\n"
"st1 { v15.b }[10], [x11]\n"
- "st1 { v8.b }[10], [x24]\n"
+ "st1 { v8.b }[10], [x25]\n"
"b 56f\n"
"51:" // Height 2: Partial direct writeback: partial_1_8
"tbz x10, #0, 56f\n"
"st1 { v15.b }[8], [x11]\n"
- "st1 { v8.b }[8], [x24]\n"
+ "st1 { v8.b }[8], [x25]\n"
"b 56f\n"
"52:" // Height 2: Partial direct writeback: partial_4_0
"tbz x10, #2, 54f\n"
"str s15, [x11], #0x4\n"
- "str s8, [x24], #0x4\n"
+ "str s8, [x25], #0x4\n"
"tbz x10, #1, 53f\n"
"st1 { v15.h }[2], [x11], #0x2\n"
- "st1 { v8.h }[2], [x24], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
"tbz x10, #0, 56f\n"
"st1 { v15.b }[6], [x11]\n"
- "st1 { v8.b }[6], [x24]\n"
+ "st1 { v8.b }[6], [x25]\n"
"b 56f\n"
"53:" // Height 2: Partial direct writeback: partial_1_4
"tbz x10, #0, 56f\n"
"st1 { v15.b }[4], [x11]\n"
- "st1 { v8.b }[4], [x24]\n"
+ "st1 { v8.b }[4], [x25]\n"
"b 56f\n"
"54:" // Height 2: Partial direct writeback: partial_2_0
"tbz x10, #1, 55f\n"
"str h15, [x11], #0x2\n"
- "str h8, [x24], #0x2\n"
+ "str h8, [x25], #0x2\n"
"tbz x10, #0, 56f\n"
"st1 { v15.b }[2], [x11]\n"
- "st1 { v8.b }[2], [x24]\n"
+ "st1 { v8.b }[2], [x25]\n"
"b 56f\n"
"55:" // Height 2: Partial direct writeback: partial_1_0
"str b15, [x11, #0x0]\n"
- "str b8, [x24, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
"56:" // Height 2: Partial direct writeback: Done
"b 58f\n"
"57:" // Height 2: Full writeback
"str q15, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q8, [x24, #0x0]\n"
+ "str q8, [x25, #0x0]\n"
"58:" // Height 2: Writeback done
"subs x10, x10, #0x10\n"
"bgt 31b\n"
@@ -856,13 +855,13 @@ void a64_hybrid_s8qs_mmla_6x16 (
"62:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 63f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 64f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -871,8 +870,8 @@ void a64_hybrid_s8qs_mmla_6x16 (
"b 64f\n"
"63:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"64:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 67f\n"
@@ -884,167 +883,167 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q6, [x9, #0x10]\n"
"blt 66f\n"
"65:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v28.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v28.2d\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x9, #0x10]\n"
"bge 65b\n"
"66:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q24, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e98a769 // smmla v9.4s, v27.16b, v24.16b\n"
+ "trn2 v3.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e98a751 // smmla v17.4s, v26.16b, v24.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e80a76d // smmla v13.4s, v27.16b, v0.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a755 // smmla v21.4s, v26.16b, v0.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"67:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 74f\n"
"cmp x27, #0x8\n"
"blt 69f\n"
"68:" // Height 3: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr q6, [x9, #0x0]\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr q25, [x9, #0x0]\n"
+ "trn1 v26.2d, v24.2d, v26.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
"sub x27, x27, #0x8\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
"bge 68b\n"
"69:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 74f\n"
@@ -1082,74 +1081,74 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b2, [x25, #0x0]\n"
"ldr b3, [x24, #0x0]\n"
"73:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q7, [x9, #0x0]\n"
- "ldr q6, [x9, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn1 v26.2d, v3.2d, v24.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9ca754 // smmla v20.4s, v26.16b, v28.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
"74:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 62b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr q28, [x14, #0x0]\n"
+ "ldr q27, [x14, #0x10]\n"
+ "uzp1 v26.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x11, x20\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
- "add x23, x24, x20\n"
+ "add x24, x25, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
"uzp1 v16.2d, v16.2d, v20.2d\n"
"uzp1 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v18.2d, v18.2d, v22.2d\n"
"uzp1 v19.2d, v19.2d, v23.2d\n"
"add x14, x14, #0x40\n"
- "mov v23.16b, v7.16b\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v12.4s, v12.4s, v1.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v3.4s\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "mov v23.16b, v26.16b\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "add v13.4s, v13.4s, v25.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v27.4s\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #4, 75f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1163,10 +1162,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 76f\n"
"75:" // Height 3: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1187,55 +1186,55 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 77f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v12.16b, v1.16b\n"
- "and v6.16b, v13.16b, v2.16b\n"
- "and v7.16b, v14.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v12.4s, v12.4s, v5.4s\n"
- "sqadd v13.4s, v13.4s, v6.4s\n"
- "sqadd v14.4s, v14.4s, v7.4s\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "and v22.16b, v12.16b, v1.16b\n"
+ "and v21.16b, v13.16b, v2.16b\n"
+ "and v20.16b, v14.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "sqadd v12.4s, v12.4s, v22.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v20.4s\n"
+ "and v24.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"77:" // Height 3: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -1243,132 +1242,132 @@ void a64_hybrid_s8qs_mmla_6x16 (
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add v23.4s, v23.4s, v22.4s\n"
+ "add v12.4s, v12.4s, v22.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v22.4s\n"
+ "add v8.4s, v8.4s, v22.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "add v10.4s, v10.4s, v22.4s\n"
+ "add v11.4s, v11.4s, v22.4s\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
+ "smin v23.4s, v23.4s, v21.4s\n"
+ "smin v12.4s, v12.4s, v21.4s\n"
+ "smin v13.4s, v13.4s, v21.4s\n"
+ "smin v14.4s, v14.4s, v21.4s\n"
+ "smin v8.4s, v8.4s, v21.4s\n"
+ "smin v9.4s, v9.4s, v21.4s\n"
+ "smin v10.4s, v10.4s, v21.4s\n"
+ "smin v11.4s, v11.4s, v21.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v23.4s, v23.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v23.8h, v23.8h, v12.8h\n"
- "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v21.8h, v13.8h, v14.8h\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v20.8h, v10.8h, v11.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v12.16b\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v23.16b, v23.16b, v21.16b\n"
+ "uzp1 v8.16b, v8.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 86f\n"
"tbz x10, #3, 81f\n"
"str d23, [x11], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x10, #2, 79f\n"
"st1 { v23.s }[2], [x11], #0x4\n"
- "st1 { v8.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x10, #1, 78f\n"
"st1 { v23.h }[6], [x11], #0x2\n"
- "st1 { v8.h }[6], [x24], #0x2\n"
- "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
"tbz x10, #0, 85f\n"
"st1 { v23.b }[14], [x11]\n"
- "st1 { v8.b }[14], [x24]\n"
- "st1 { v16.b }[14], [x23]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 85f\n"
"78:" // Height 3: Partial direct writeback: partial_1_12
"tbz x10, #0, 85f\n"
"st1 { v23.b }[12], [x11]\n"
- "st1 { v8.b }[12], [x24]\n"
- "st1 { v16.b }[12], [x23]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 85f\n"
"79:" // Height 3: Partial direct writeback: partial_2_8
"tbz x10, #1, 80f\n"
"st1 { v23.h }[4], [x11], #0x2\n"
- "st1 { v8.h }[4], [x24], #0x2\n"
- "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
"tbz x10, #0, 85f\n"
"st1 { v23.b }[10], [x11]\n"
- "st1 { v8.b }[10], [x24]\n"
- "st1 { v16.b }[10], [x23]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 85f\n"
"80:" // Height 3: Partial direct writeback: partial_1_8
"tbz x10, #0, 85f\n"
"st1 { v23.b }[8], [x11]\n"
- "st1 { v8.b }[8], [x24]\n"
- "st1 { v16.b }[8], [x23]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 85f\n"
"81:" // Height 3: Partial direct writeback: partial_4_0
"tbz x10, #2, 83f\n"
"str s23, [x11], #0x4\n"
- "str s8, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x10, #1, 82f\n"
"st1 { v23.h }[2], [x11], #0x2\n"
- "st1 { v8.h }[2], [x24], #0x2\n"
- "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
"tbz x10, #0, 85f\n"
"st1 { v23.b }[6], [x11]\n"
- "st1 { v8.b }[6], [x24]\n"
- "st1 { v16.b }[6], [x23]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 85f\n"
"82:" // Height 3: Partial direct writeback: partial_1_4
"tbz x10, #0, 85f\n"
"st1 { v23.b }[4], [x11]\n"
- "st1 { v8.b }[4], [x24]\n"
- "st1 { v16.b }[4], [x23]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 85f\n"
"83:" // Height 3: Partial direct writeback: partial_2_0
"tbz x10, #1, 84f\n"
"str h23, [x11], #0x2\n"
- "str h8, [x24], #0x2\n"
- "str h16, [x23], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
"tbz x10, #0, 85f\n"
"st1 { v23.b }[2], [x11]\n"
- "st1 { v8.b }[2], [x24]\n"
- "st1 { v16.b }[2], [x23]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 85f\n"
"84:" // Height 3: Partial direct writeback: partial_1_0
"str b23, [x11, #0x0]\n"
- "str b8, [x24, #0x0]\n"
- "str b16, [x23, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"85:" // Height 3: Partial direct writeback: Done
"b 87f\n"
"86:" // Height 3: Full writeback
"str q23, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q8, [x24, #0x0]\n"
- "str q16, [x23, #0x0]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"87:" // Height 3: Writeback done
"subs x10, x10, #0x10\n"
"bgt 60b\n"
@@ -1402,14 +1401,14 @@ void a64_hybrid_s8qs_mmla_6x16 (
"91:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 92f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 93f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1419,9 +1418,9 @@ void a64_hybrid_s8qs_mmla_6x16 (
"b 93f\n"
"92:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"93:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 96f\n"
@@ -1434,173 +1433,173 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q6, [x9, #0x10]\n"
"blt 95f\n"
"94:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
"add x23, x23, #0x10\n"
"ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x9, #0x10]\n"
"bge 94b\n"
"95:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
"add x23, x23, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
"96:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 103f\n"
"cmp x27, #0x8\n"
"blt 98f\n"
"97:" // Height 4: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v26.2d, v25.2d, v24.2d\n"
"cmp x27, #0x8\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
"bge 97b\n"
"98:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 103f\n"
@@ -1645,84 +1644,84 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b3, [x24, #0x0]\n"
"ldr b4, [x23, #0x0]\n"
"102:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q7, [x9, #0x0]\n"
- "ldr q6, [x9, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
"103:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 91b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr q28, [x14, #0x0]\n"
+ "ldr q27, [x14, #0x10]\n"
+ "uzp1 v26.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
- "add x24, x11, x20\n"
+ "add x25, x11, x20\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
- "add x22, x23, x20\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"add x14, x14, #0x40\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "mov v23.16b, v7.16b\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v12.4s, v12.4s, v1.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v3.4s\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "mov v23.16b, v26.16b\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "add v13.4s, v13.4s, v25.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v27.4s\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v25.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #4, 104f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1736,10 +1735,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 105f\n"
"104:" // Height 4: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -1764,67 +1763,67 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 106f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v12.16b, v1.16b\n"
- "and v6.16b, v13.16b, v2.16b\n"
- "and v7.16b, v14.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v12.4s, v12.4s, v5.4s\n"
- "sqadd v13.4s, v13.4s, v6.4s\n"
- "sqadd v14.4s, v14.4s, v7.4s\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v15.16b, v0.16b\n"
- "and v5.16b, v20.16b, v1.16b\n"
- "and v6.16b, v21.16b, v2.16b\n"
- "and v7.16b, v22.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v27.16b, v23.16b, v0.16b\n"
+ "and v26.16b, v12.16b, v1.16b\n"
+ "and v25.16b, v13.16b, v2.16b\n"
+ "and v24.16b, v14.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v27.4s\n"
+ "sqadd v12.4s, v12.4s, v26.4s\n"
+ "sqadd v13.4s, v13.4s, v25.4s\n"
+ "sqadd v14.4s, v14.4s, v24.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v15.16b, v0.16b\n"
+ "and v26.16b, v20.16b, v1.16b\n"
+ "and v25.16b, v21.16b, v2.16b\n"
+ "and v24.16b, v22.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v27.4s\n"
+ "sqadd v20.4s, v20.4s, v26.4s\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
"106:" // Height 4: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -1836,163 +1835,163 @@ void a64_hybrid_s8qs_mmla_6x16 (
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v12.4s, v12.4s, v26.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v26.4s\n"
+ "add v8.4s, v8.4s, v26.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "add v10.4s, v10.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v26.4s\n"
+ "add v15.4s, v15.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v12.4s, v12.4s, v25.4s\n"
+ "smin v13.4s, v13.4s, v25.4s\n"
+ "smin v14.4s, v14.4s, v25.4s\n"
+ "smin v8.4s, v8.4s, v25.4s\n"
+ "smin v9.4s, v9.4s, v25.4s\n"
+ "smin v10.4s, v10.4s, v25.4s\n"
+ "smin v11.4s, v11.4s, v25.4s\n"
+ "smin v15.4s, v15.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
"uzp1 v23.8h, v23.8h, v12.8h\n"
- "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v25.8h, v13.8h, v14.8h\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v24.8h, v10.8h, v11.8h\n"
"uzp1 v15.8h, v15.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v12.16b\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v23.16b, v23.16b, v25.16b\n"
+ "uzp1 v8.16b, v8.16b, v24.16b\n"
"uzp1 v15.16b, v15.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 115f\n"
"tbz x10, #3, 110f\n"
"str d23, [x11], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x10, #2, 108f\n"
"st1 { v23.s }[2], [x11], #0x4\n"
- "st1 { v8.s }[2], [x24], #0x4\n"
- "st1 { v15.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
"tbz x10, #1, 107f\n"
"st1 { v23.h }[6], [x11], #0x2\n"
- "st1 { v8.h }[6], [x24], #0x2\n"
- "st1 { v15.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
"tbz x10, #0, 114f\n"
"st1 { v23.b }[14], [x11]\n"
- "st1 { v8.b }[14], [x24]\n"
- "st1 { v15.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 114f\n"
"107:" // Height 4: Partial direct writeback: partial_1_12
"tbz x10, #0, 114f\n"
"st1 { v23.b }[12], [x11]\n"
- "st1 { v8.b }[12], [x24]\n"
- "st1 { v15.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 114f\n"
"108:" // Height 4: Partial direct writeback: partial_2_8
"tbz x10, #1, 109f\n"
"st1 { v23.h }[4], [x11], #0x2\n"
- "st1 { v8.h }[4], [x24], #0x2\n"
- "st1 { v15.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
"tbz x10, #0, 114f\n"
"st1 { v23.b }[10], [x11]\n"
- "st1 { v8.b }[10], [x24]\n"
- "st1 { v15.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 114f\n"
"109:" // Height 4: Partial direct writeback: partial_1_8
"tbz x10, #0, 114f\n"
"st1 { v23.b }[8], [x11]\n"
- "st1 { v8.b }[8], [x24]\n"
- "st1 { v15.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 114f\n"
"110:" // Height 4: Partial direct writeback: partial_4_0
"tbz x10, #2, 112f\n"
"str s23, [x11], #0x4\n"
- "str s8, [x24], #0x4\n"
- "str s15, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
"tbz x10, #1, 111f\n"
"st1 { v23.h }[2], [x11], #0x2\n"
- "st1 { v8.h }[2], [x24], #0x2\n"
- "st1 { v15.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
"tbz x10, #0, 114f\n"
"st1 { v23.b }[6], [x11]\n"
- "st1 { v8.b }[6], [x24]\n"
- "st1 { v15.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 114f\n"
"111:" // Height 4: Partial direct writeback: partial_1_4
"tbz x10, #0, 114f\n"
"st1 { v23.b }[4], [x11]\n"
- "st1 { v8.b }[4], [x24]\n"
- "st1 { v15.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 114f\n"
"112:" // Height 4: Partial direct writeback: partial_2_0
"tbz x10, #1, 113f\n"
"str h23, [x11], #0x2\n"
- "str h8, [x24], #0x2\n"
- "str h15, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
"tbz x10, #0, 114f\n"
"st1 { v23.b }[2], [x11]\n"
- "st1 { v8.b }[2], [x24]\n"
- "st1 { v15.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 114f\n"
"113:" // Height 4: Partial direct writeback: partial_1_0
"str b23, [x11, #0x0]\n"
- "str b8, [x24, #0x0]\n"
- "str b15, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"114:" // Height 4: Partial direct writeback: Done
"b 116f\n"
"115:" // Height 4: Full writeback
"str q23, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q8, [x24, #0x0]\n"
- "str q15, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"116:" // Height 4: Writeback done
"subs x10, x10, #0x10\n"
"bgt 89b\n"
@@ -2034,15 +2033,15 @@ void a64_hybrid_s8qs_mmla_6x16 (
"120:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 122f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2053,10 +2052,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"b 122f\n"
"121:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"122:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 125f\n"
@@ -2120,42 +2119,42 @@ void a64_hybrid_s8qs_mmla_6x16 (
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
"ldr q2, [x25, #0x0]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "ldr q6, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"bge 123b\n"
"124:" // Height 5: Multiply loop: Single iteration only
@@ -2208,86 +2207,86 @@ void a64_hybrid_s8qs_mmla_6x16 (
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "ldr q2, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"125:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 132f\n"
"cmp x27, #0x8\n"
"blt 127f\n"
"126:" // Height 5: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr q6, [x9, #0x0]\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x9, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
"bge 126b\n"
"127:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 132f\n"
@@ -2340,74 +2339,74 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b5, [x22, #0x0]\n"
"131:" // Height 5: Multiply loop: Ragged operand read: Done
"ldr q7, [x9, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a4c9 // smmla v9.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a4ca // smmla v10.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e81a4cb // smmla v11.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
"132:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 120b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr q4, [x14, #0x0]\n"
+ "ldr q3, [x14, #0x10]\n"
+ "uzp1 v2.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
+ "add x25, x11, x20\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
- "add x22, x23, x20\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
- "add x21, x22, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"add x14, x14, #0x40\n"
@@ -2415,27 +2414,27 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp1 v25.2d, v25.2d, v29.2d\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v7.16b\n"
- "add v31.4s, v31.4s, v0.4s\n"
- "add v12.4s, v12.4s, v1.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v3.4s\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v3.4s\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
"tbz %x[flags], #4, 133f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -2449,10 +2448,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 134f\n"
"133:" // Height 5: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -2481,79 +2480,79 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 135f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v12.16b, v1.16b\n"
- "and v6.16b, v13.16b, v2.16b\n"
- "and v7.16b, v14.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v12.4s, v12.4s, v5.4s\n"
- "sqadd v13.4s, v13.4s, v6.4s\n"
- "sqadd v14.4s, v14.4s, v7.4s\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v15.16b, v0.16b\n"
- "and v5.16b, v20.16b, v1.16b\n"
- "and v6.16b, v21.16b, v2.16b\n"
- "and v7.16b, v22.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v30.16b, v31.16b, v0.16b\n"
+ "and v29.16b, v12.16b, v1.16b\n"
+ "and v28.16b, v13.16b, v2.16b\n"
+ "and v23.16b, v14.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v30.4s\n"
+ "sqadd v12.4s, v12.4s, v29.4s\n"
+ "sqadd v13.4s, v13.4s, v28.4s\n"
+ "sqadd v14.4s, v14.4s, v23.4s\n"
+ "and v30.16b, v8.16b, v0.16b\n"
+ "and v29.16b, v9.16b, v1.16b\n"
+ "and v28.16b, v10.16b, v2.16b\n"
+ "and v23.16b, v11.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v30.4s\n"
+ "sqadd v9.4s, v9.4s, v29.4s\n"
+ "sqadd v10.4s, v10.4s, v28.4s\n"
+ "sqadd v11.4s, v11.4s, v23.4s\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "and v29.16b, v20.16b, v1.16b\n"
+ "and v28.16b, v21.16b, v2.16b\n"
+ "and v23.16b, v22.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v23.4s\n"
+ "and v30.16b, v16.16b, v0.16b\n"
+ "and v29.16b, v17.16b, v1.16b\n"
+ "and v28.16b, v18.16b, v2.16b\n"
+ "and v23.16b, v19.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v23.4s\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v1.16b\n"
+ "and v28.16b, v26.16b, v2.16b\n"
+ "and v23.16b, v27.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
"135:" // Height 5: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -2569,194 +2568,194 @@ void a64_hybrid_s8qs_mmla_6x16 (
"srshl v25.4s, v25.4s, v1.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v12.4s, v12.4s, v29.4s\n"
+ "add v13.4s, v13.4s, v29.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
+ "add v8.4s, v8.4s, v29.4s\n"
+ "add v9.4s, v9.4s, v29.4s\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "add v11.4s, v11.4s, v29.4s\n"
+ "add v15.4s, v15.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v12.4s, v12.4s, v28.4s\n"
+ "smin v13.4s, v13.4s, v28.4s\n"
+ "smin v14.4s, v14.4s, v28.4s\n"
+ "smin v8.4s, v8.4s, v28.4s\n"
+ "smin v9.4s, v9.4s, v28.4s\n"
+ "smin v10.4s, v10.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v28.4s\n"
+ "smin v15.4s, v15.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v12.4s, v12.4s, v23.4s\n"
+ "smax v13.4s, v13.4s, v23.4s\n"
+ "smax v14.4s, v14.4s, v23.4s\n"
+ "smax v8.4s, v8.4s, v23.4s\n"
+ "smax v9.4s, v9.4s, v23.4s\n"
+ "smax v10.4s, v10.4s, v23.4s\n"
+ "smax v11.4s, v11.4s, v23.4s\n"
+ "smax v15.4s, v15.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
"uzp1 v31.8h, v31.8h, v12.8h\n"
- "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v28.8h, v13.8h, v14.8h\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v23.8h, v10.8h, v11.8h\n"
"uzp1 v15.8h, v15.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v31.16b, v31.16b, v12.16b\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v28.16b\n"
+ "uzp1 v8.16b, v8.16b, v23.16b\n"
"uzp1 v15.16b, v15.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 144f\n"
"tbz x10, #3, 139f\n"
"str d31, [x11], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x10, #2, 137f\n"
"st1 { v31.s }[2], [x11], #0x4\n"
- "st1 { v8.s }[2], [x24], #0x4\n"
- "st1 { v15.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x10, #1, 136f\n"
"st1 { v31.h }[6], [x11], #0x2\n"
- "st1 { v8.h }[6], [x24], #0x2\n"
- "st1 { v15.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x10, #0, 143f\n"
"st1 { v31.b }[14], [x11]\n"
- "st1 { v8.b }[14], [x24]\n"
- "st1 { v15.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial direct writeback: partial_1_12
"tbz x10, #0, 143f\n"
"st1 { v31.b }[12], [x11]\n"
- "st1 { v8.b }[12], [x24]\n"
- "st1 { v15.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 143f\n"
"137:" // Height 5: Partial direct writeback: partial_2_8
"tbz x10, #1, 138f\n"
"st1 { v31.h }[4], [x11], #0x2\n"
- "st1 { v8.h }[4], [x24], #0x2\n"
- "st1 { v15.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x10, #0, 143f\n"
"st1 { v31.b }[10], [x11]\n"
- "st1 { v8.b }[10], [x24]\n"
- "st1 { v15.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial direct writeback: partial_1_8
"tbz x10, #0, 143f\n"
"st1 { v31.b }[8], [x11]\n"
- "st1 { v8.b }[8], [x24]\n"
- "st1 { v15.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 143f\n"
"139:" // Height 5: Partial direct writeback: partial_4_0
"tbz x10, #2, 141f\n"
"str s31, [x11], #0x4\n"
- "str s8, [x24], #0x4\n"
- "str s15, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x10, #1, 140f\n"
"st1 { v31.h }[2], [x11], #0x2\n"
- "st1 { v8.h }[2], [x24], #0x2\n"
- "st1 { v15.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x10, #0, 143f\n"
"st1 { v31.b }[6], [x11]\n"
- "st1 { v8.b }[6], [x24]\n"
- "st1 { v15.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial direct writeback: partial_1_4
"tbz x10, #0, 143f\n"
"st1 { v31.b }[4], [x11]\n"
- "st1 { v8.b }[4], [x24]\n"
- "st1 { v15.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 143f\n"
"141:" // Height 5: Partial direct writeback: partial_2_0
"tbz x10, #1, 142f\n"
"str h31, [x11], #0x2\n"
- "str h8, [x24], #0x2\n"
- "str h15, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x10, #0, 143f\n"
"st1 { v31.b }[2], [x11]\n"
- "st1 { v8.b }[2], [x24]\n"
- "st1 { v15.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial direct writeback: partial_1_0
"str b31, [x11, #0x0]\n"
- "str b8, [x24, #0x0]\n"
- "str b15, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"143:" // Height 5: Partial direct writeback: Done
"b 145f\n"
"144:" // Height 5: Full writeback
"str q31, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q8, [x24, #0x0]\n"
- "str q15, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"145:" // Height 5: Writeback done
"subs x10, x10, #0x10\n"
"bgt 118b\n"
@@ -2801,16 +2800,16 @@ void a64_hybrid_s8qs_mmla_6x16 (
"149:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 150f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 151f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2822,11 +2821,11 @@ void a64_hybrid_s8qs_mmla_6x16 (
"b 151f\n"
"150:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"151:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 154f\n"
@@ -2893,42 +2892,42 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr q2, [x25, #0x0]\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "ldr q6, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x9, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
"bge 152b\n"
@@ -2984,87 +2983,87 @@ void a64_hybrid_s8qs_mmla_6x16 (
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x90]\n"
+ "ldr q0, [x9, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x9, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x9, #0xf0]\n"
+ "ldr q2, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
"add x9, x9, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"154:" // Height 6: Multiply loop: Main loop skip
"cbz x27, 161f\n"
"cmp x27, #0x8\n"
"blt 156f\n"
"155:" // Height 6: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- "ldr q6, [x9, #0x0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
"bge 155b\n"
"156:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 161f\n"
@@ -3124,77 +3123,77 @@ void a64_hybrid_s8qs_mmla_6x16 (
"ldr b6, [x21, #0x0]\n"
"160:" // Height 6: Multiply loop: Ragged operand read: Done
"ldr q7, [x9, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x9, #0x10]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x9, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x9, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
+ "trn1 v4.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a448 // smmla v8.4s, v2.16b, v7.16b\n"
+ "trn1 v3.2d, v5.2d, v6.2d\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e87a490 // smmla v16.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e87a478 // smmla v24.4s, v3.16b, v7.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a44c // smmla v12.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a494 // smmla v20.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47c // smmla v28.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a449 // smmla v9.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e81a491 // smmla v17.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a479 // smmla v25.4s, v3.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a44d // smmla v13.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a495 // smmla v21.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47d // smmla v29.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a44a // smmla v10.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e81a492 // smmla v18.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a47a // smmla v26.4s, v3.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a44e // smmla v14.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a496 // smmla v22.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47e // smmla v30.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ ".inst 0x4e81a44b // smmla v11.4s, v2.16b, v1.16b\n"
"add x9, x9, #0x80\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e81a493 // smmla v19.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a47b // smmla v27.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e80a44f // smmla v15.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a497 // smmla v23.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47f // smmla v31.4s, v3.16b, v0.16b\n"
"161:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 149b\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x14, #0x10]\n"
- "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr q4, [x14, #0x0]\n"
+ "ldr q3, [x14, #0x10]\n"
+ "uzp1 v2.2d, v8.2d, v12.2d\n"
"uzp2 v8.2d, v8.2d, v12.2d\n"
- "ldr q2, [x14, #0x20]\n"
- "ldr q3, [x14, #0x30]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
"uzp1 v12.2d, v9.2d, v13.2d\n"
"uzp2 v9.2d, v9.2d, v13.2d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
"uzp1 v13.2d, v10.2d, v14.2d\n"
"uzp2 v10.2d, v10.2d, v14.2d\n"
"uzp1 v14.2d, v11.2d, v15.2d\n"
+ "add x23, x24, x20\n"
"add x22, x23, x20\n"
- "add x21, x22, x20\n"
"uzp2 v11.2d, v11.2d, v15.2d\n"
"uzp1 v15.2d, v16.2d, v20.2d\n"
- "add x20, x21, x20\n"
+ "add x21, x22, x20\n"
"prfm pstl1keep, [x11, #0x0]\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"add x14, x14, #0x40\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -3205,31 +3204,31 @@ void a64_hybrid_s8qs_mmla_6x16 (
"uzp2 v26.2d, v26.2d, v30.2d\n"
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v7.16b\n"
- "add v31.4s, v31.4s, v0.4s\n"
- "add v12.4s, v12.4s, v1.4s\n"
- "add v13.4s, v13.4s, v2.4s\n"
- "add v14.4s, v14.4s, v3.4s\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v3.4s\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
"tbz %x[flags], #4, 162f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -3243,10 +3242,10 @@ void a64_hybrid_s8qs_mmla_6x16 (
"add x13, x13, #0x40\n"
"b 163f\n"
"162:" // Height 6: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x25]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v2.16b, v0.16b\n"
@@ -3279,91 +3278,91 @@ void a64_hybrid_s8qs_mmla_6x16 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 164f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v12.16b, v1.16b\n"
- "and v6.16b, v13.16b, v2.16b\n"
- "and v7.16b, v14.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v31.16b, v0.16b\n"
+ "and v6.16b, v12.16b, v1.16b\n"
+ "and v5.16b, v13.16b, v2.16b\n"
+ "and v4.16b, v14.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v12.4s, v12.4s, v5.4s\n"
- "sqadd v13.4s, v13.4s, v6.4s\n"
- "sqadd v14.4s, v14.4s, v7.4s\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v15.16b, v0.16b\n"
- "and v5.16b, v20.16b, v1.16b\n"
- "and v6.16b, v21.16b, v2.16b\n"
- "and v7.16b, v22.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sqadd v12.4s, v12.4s, v6.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqadd v14.4s, v14.4s, v4.4s\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v28.16b, v1.16b\n"
- "and v6.16b, v29.16b, v2.16b\n"
- "and v7.16b, v30.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v0.16b\n"
+ "and v6.16b, v20.16b, v1.16b\n"
+ "and v5.16b, v21.16b, v2.16b\n"
+ "and v4.16b, v22.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v28.4s, v28.4s, v5.4s\n"
- "sqadd v29.4s, v29.4s, v6.4s\n"
- "sqadd v30.4s, v30.4s, v7.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v0.16b\n"
+ "and v6.16b, v28.16b, v1.16b\n"
+ "and v5.16b, v29.16b, v2.16b\n"
+ "and v4.16b, v30.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v6.4s\n"
+ "sqadd v29.4s, v29.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
"164:" // Height 6: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v12.4s, v12.4s, v1.4s\n"
"srshl v13.4s, v13.4s, v2.4s\n"
"srshl v14.4s, v14.4s, v3.4s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x25]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x20]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
"cmp x10, #0x10\n"
@@ -3383,225 +3382,225 @@ void a64_hybrid_s8qs_mmla_6x16 (
"srshl v25.4s, v25.4s, v1.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v6.4s\n"
+ "add v12.4s, v12.4s, v6.4s\n"
+ "add v13.4s, v13.4s, v6.4s\n"
+ "add v14.4s, v14.4s, v6.4s\n"
+ "add v8.4s, v8.4s, v6.4s\n"
+ "add v9.4s, v9.4s, v6.4s\n"
+ "add v10.4s, v10.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v15.4s, v15.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v6.4s\n"
+ "add v17.4s, v17.4s, v6.4s\n"
+ "add v18.4s, v18.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v6.4s\n"
+ "add v29.4s, v29.4s, v6.4s\n"
+ "add v30.4s, v30.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v5.4s\n"
+ "smin v12.4s, v12.4s, v5.4s\n"
+ "smin v13.4s, v13.4s, v5.4s\n"
+ "smin v14.4s, v14.4s, v5.4s\n"
+ "smin v8.4s, v8.4s, v5.4s\n"
+ "smin v9.4s, v9.4s, v5.4s\n"
+ "smin v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v5.4s\n"
+ "smin v15.4s, v15.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v5.4s\n"
+ "smin v21.4s, v21.4s, v5.4s\n"
+ "smin v22.4s, v22.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v5.4s\n"
+ "smin v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v5.4s\n"
+ "smin v29.4s, v29.4s, v5.4s\n"
+ "smin v30.4s, v30.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v5.4s\n"
+ "smin v26.4s, v26.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v4.4s\n"
+ "smax v12.4s, v12.4s, v4.4s\n"
+ "smax v13.4s, v13.4s, v4.4s\n"
+ "smax v14.4s, v14.4s, v4.4s\n"
+ "smax v8.4s, v8.4s, v4.4s\n"
+ "smax v9.4s, v9.4s, v4.4s\n"
+ "smax v10.4s, v10.4s, v4.4s\n"
+ "smax v11.4s, v11.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v4.4s\n"
+ "smax v21.4s, v21.4s, v4.4s\n"
+ "smax v22.4s, v22.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v4.4s\n"
+ "smax v18.4s, v18.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v4.4s\n"
+ "smax v29.4s, v29.4s, v4.4s\n"
+ "smax v30.4s, v30.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v4.4s\n"
+ "smax v26.4s, v26.4s, v4.4s\n"
+ "smax v27.4s, v27.4s, v4.4s\n"
"uzp1 v31.8h, v31.8h, v12.8h\n"
- "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v1.8h, v13.8h, v14.8h\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v0.8h, v10.8h, v11.8h\n"
"uzp1 v15.8h, v15.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v23.8h, v23.8h, v28.8h\n"
- "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v31.16b, v31.16b, v12.16b\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v1.16b\n"
+ "uzp1 v8.16b, v8.16b, v0.16b\n"
"uzp1 v15.16b, v15.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v23.16b, v23.16b, v28.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 173f\n"
"tbz x10, #3, 168f\n"
"str d31, [x11], #0x8\n"
- "str d8, [x24], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x10, #2, 166f\n"
"st1 { v31.s }[2], [x11], #0x4\n"
- "st1 { v8.s }[2], [x24], #0x4\n"
- "st1 { v15.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v23.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
"tbz x10, #1, 165f\n"
"st1 { v31.h }[6], [x11], #0x2\n"
- "st1 { v8.h }[6], [x24], #0x2\n"
- "st1 { v15.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v23.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
"tbz x10, #0, 172f\n"
"st1 { v31.b }[14], [x11]\n"
- "st1 { v8.b }[14], [x24]\n"
- "st1 { v15.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v23.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 172f\n"
"165:" // Height 6: Partial direct writeback: partial_1_12
"tbz x10, #0, 172f\n"
"st1 { v31.b }[12], [x11]\n"
- "st1 { v8.b }[12], [x24]\n"
- "st1 { v15.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v23.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 172f\n"
"166:" // Height 6: Partial direct writeback: partial_2_8
"tbz x10, #1, 167f\n"
"st1 { v31.h }[4], [x11], #0x2\n"
- "st1 { v8.h }[4], [x24], #0x2\n"
- "st1 { v15.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v23.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
"tbz x10, #0, 172f\n"
"st1 { v31.b }[10], [x11]\n"
- "st1 { v8.b }[10], [x24]\n"
- "st1 { v15.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v23.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 172f\n"
"167:" // Height 6: Partial direct writeback: partial_1_8
"tbz x10, #0, 172f\n"
"st1 { v31.b }[8], [x11]\n"
- "st1 { v8.b }[8], [x24]\n"
- "st1 { v15.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v23.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 172f\n"
"168:" // Height 6: Partial direct writeback: partial_4_0
"tbz x10, #2, 170f\n"
"str s31, [x11], #0x4\n"
- "str s8, [x24], #0x4\n"
- "str s15, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s23, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
"tbz x10, #1, 169f\n"
"st1 { v31.h }[2], [x11], #0x2\n"
- "st1 { v8.h }[2], [x24], #0x2\n"
- "st1 { v15.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v23.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
"tbz x10, #0, 172f\n"
"st1 { v31.b }[6], [x11]\n"
- "st1 { v8.b }[6], [x24]\n"
- "st1 { v15.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v23.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 172f\n"
"169:" // Height 6: Partial direct writeback: partial_1_4
"tbz x10, #0, 172f\n"
"st1 { v31.b }[4], [x11]\n"
- "st1 { v8.b }[4], [x24]\n"
- "st1 { v15.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v23.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 172f\n"
"170:" // Height 6: Partial direct writeback: partial_2_0
"tbz x10, #1, 171f\n"
"str h31, [x11], #0x2\n"
- "str h8, [x24], #0x2\n"
- "str h15, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h23, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
"tbz x10, #0, 172f\n"
"st1 { v31.b }[2], [x11]\n"
- "st1 { v8.b }[2], [x24]\n"
- "st1 { v15.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v23.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 172f\n"
"171:" // Height 6: Partial direct writeback: partial_1_0
"str b31, [x11, #0x0]\n"
- "str b8, [x24, #0x0]\n"
- "str b15, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b23, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"172:" // Height 6: Partial direct writeback: Done
"b 174f\n"
"173:" // Height 6: Full writeback
"str q31, [x11, #0x0]\n"
"add x11, x11, #0x10\n"
- "str q8, [x24, #0x0]\n"
- "str q15, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q23, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"174:" // Height 6: Writeback done
"subs x10, x10, #0x10\n"
"bgt 147b\n"
@@ -3617,7 +3616,6 @@ void a64_hybrid_s8qs_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"176:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 48ce67613e..a02fbe8f28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -79,12 +79,12 @@ public:
switch (ci->get_cpu_model()) {
default:
return { 31.65 };
- case CPUModel::A55r1:
- return { 9.217 };
case CPUModel::A510:
return { 15.87 };
case CPUModel::V1:
return { 54.50 };
+ case CPUModel::A55r1:
+ return { 9.217 };
}
}
@@ -97,7 +97,7 @@ public:
case CPUModel::A510:
return { 16.66, 3.92, 0.48 };
case CPUModel::V1:
- return { 55.40, 19.21, 0.93 };
+ return { 42.62, 16.32, 0.83 };
}
}
@@ -121,5 +121,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
index 8046b2ebb0..289d38c3b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -165,11 +164,11 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
"cbnz x15, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
@@ -186,129 +185,129 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x38]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x12, [x16, #0x48]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x78]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xb8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xf8]\n"
- "mov v7.d[1], x11\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "mov v16.d[1], x20\n"
"add x13, x13, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v0.d[1], x10\n"
- "mov v7.d[1], x11\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
"add x13, x13, #0x10\n"
"sub x14, x14, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 24f\n"
"cmp x14, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s18, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
"cmp x14, #0x4\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x16, x16, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
@@ -321,14 +320,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -499,226 +498,226 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
"cbnz x15, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
+ "add x12, x12, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
+ "add x12, x13, x21\n"
"50:" // Height 2: input setup done
"cmp x14, #0x10\n"
"blt 53f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v6.d[1], x12\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
"add x13, x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x28, [x9, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v1.d[1], x28\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x14, 58f\n"
"cmp x14, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
"cbz x14, 58f\n"
"tbz x14, #1, 56f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
"tbz x14, #0, 57f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x12]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -936,281 +935,281 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
"cbnz x15, 84f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
- "add x27, x27, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
"84:" // Height 3: input setup done
"cmp x14, #0x10\n"
"blt 87f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr d20, [x16, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x16, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x16, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
"add x13, x13, #0x10\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x16, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v0.d[1], x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v1.d[1], x28\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v2.d[1], x26\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "mov v7.d[1], x11\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x16, #0xf0]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
"cbz x14, 92f\n"
"cmp x14, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s24, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s23, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
"cbz x14, 92f\n"
"tbz x14, #1, 90f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
"tbz x14, #0, 91f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -1475,336 +1474,336 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
"cbnz x15, 118f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"118:" // Height 4: input setup done
"cmp x14, #0x10\n"
"blt 121f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x16, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
"cmp x14, #0x20\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
- "mov v7.d[1], x11\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x16, #0xf0]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
"cbz x14, 126f\n"
"cmp x14, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s28, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
"cbz x14, 126f\n"
"tbz x14, #1, 124f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
"tbz x14, #0, 125f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -2116,391 +2115,391 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
"cbnz x15, 152f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"152:" // Height 5: input setup done
"cmp x14, #0x10\n"
"blt 155f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x22, [x23, #0x8]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x16, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
"cmp x14, #0x20\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x9, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x16, #0xf0]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
"cbz x14, 160f\n"
"cmp x14, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s2, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
"cbz x14, 160f\n"
"tbz x14, #1, 158f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
"tbz x14, #0, 159f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -2862,98 +2861,98 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
"cbnz x15, 186f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
- "add x21, x21, x20\n"
+ "add x28, x28, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
- "add x21, x23, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"186:" // Height 6: input setup done
"cmp x14, #0x10\n"
"blt 189f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x13, #0x8]\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr x20, [x21, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
"sub x14, x14, #0x10\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
@@ -2963,96 +2962,96 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"add x16, x16, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
@@ -3061,56 +3060,56 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x11, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d3, [x10, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ "ldr d4, [x9, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d5, [x21, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
"mov v3.d[1], x24\n"
- "mov v4.d[1], x22\n"
- "mov v5.d[1], x20\n"
- "mov v7.d[1], x11\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x16, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3210,98 +3209,98 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp x14, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s6, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x16, #0x0]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x16, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x16, #0x30]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
"cbz x14, 194f\n"
"tbz x14, #1, 192f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x21], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h5, [x28], #0x2\n"
"tbz x14, #0, 193f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
+ "ld1 { v5.b }[2], [x28]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "ldr b5, [x21, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
+ "ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x16, #0x0]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x16, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x16, #0x30]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -3488,7 +3487,6 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index ddf776107a..452d647bb4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -165,11 +164,11 @@ void a64_hybrid_s8s32_dot_6x16 (
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -186,37 +185,37 @@ void a64_hybrid_s8s32_dot_6x16 (
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x20\n"
"add x10, x10, #0x100\n"
@@ -226,37 +225,37 @@ void a64_hybrid_s8s32_dot_6x16 (
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
@@ -264,17 +263,17 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
"cmp x27, #0x4\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x10, x10, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
@@ -287,14 +286,14 @@ void a64_hybrid_s8s32_dot_6x16 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -465,12 +464,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -478,7 +477,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"50:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 53f\n"
@@ -491,137 +490,137 @@ void a64_hybrid_s8s32_dot_6x16 (
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"sub x27, x27, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"cmp x27, #0x20\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"add x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 58f\n"
"cmp x27, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 58f\n"
@@ -636,19 +635,19 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr b0, [x26, #0x0]\n"
"ldr b1, [x25, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -866,13 +865,13 @@ void a64_hybrid_s8s32_dot_6x16 (
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 84f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -881,8 +880,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 84f\n"
"83:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"84:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 87f\n"
@@ -899,75 +898,75 @@ void a64_hybrid_s8s32_dot_6x16 (
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x25, x25, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"cmp x27, #0x20\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 85b\n"
@@ -977,98 +976,98 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 92f\n"
"cmp x27, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 92f\n"
@@ -1086,23 +1085,23 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr b1, [x25, #0x0]\n"
"ldr b2, [x24, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1367,14 +1366,14 @@ void a64_hybrid_s8s32_dot_6x16 (
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 118f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1384,9 +1383,9 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 118f\n"
"117:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"118:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 121f\n"
@@ -1405,7 +1404,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1413,85 +1412,85 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"cmp x27, #0x20\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 119b\n"
@@ -1502,7 +1501,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x25, x25, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1510,112 +1509,112 @@ void a64_hybrid_s8s32_dot_6x16 (
"sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 126f\n"
"cmp x27, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 126f\n"
@@ -1636,27 +1635,27 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr b2, [x24, #0x0]\n"
"ldr b3, [x23, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1968,15 +1967,15 @@ void a64_hybrid_s8s32_dot_6x16 (
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 152f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1987,10 +1986,10 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 152f\n"
"151:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"152:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 155f\n"
@@ -2013,7 +2012,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2022,100 +2021,100 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x27, #0x20\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 153b\n"
@@ -2129,7 +2128,7 @@ void a64_hybrid_s8s32_dot_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2138,131 +2137,131 @@ void a64_hybrid_s8s32_dot_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 160f\n"
"cmp x27, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 160f\n"
@@ -2286,31 +2285,31 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr b3, [x23, #0x0]\n"
"ldr b4, [x22, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2672,16 +2671,16 @@ void a64_hybrid_s8s32_dot_6x16 (
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 186f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2693,11 +2692,11 @@ void a64_hybrid_s8s32_dot_6x16 (
"b 186f\n"
"185:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"186:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 189f\n"
@@ -2976,43 +2975,43 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp x27, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 194f\n"
@@ -3039,35 +3038,35 @@ void a64_hybrid_s8s32_dot_6x16 (
"ldr b4, [x22, #0x0]\n"
"ldr b5, [x21, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3254,7 +3253,6 @@ void a64_hybrid_s8s32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
index 50ccb6fa3d..4905ba5656 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -92,7 +92,7 @@ public:
case CPUModel::A510:
return { 33.62, 3.92, 0.48 };
case CPUModel::V1:
- return { 86.36, 19.25, 0.92 };
+ return { 63.94, 16.18, 0.83 };
}
}
@@ -109,5 +109,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
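
Aside (not part of the patch): the generic.cpp diff that follows reworks register allocation around the `trn1`/`trn2` + `smmla` pairing. As a hedged sketch of what that pairing computes, assuming the i8mm extension is available (names are illustrative; the kernel is hand-scheduled assembly):

#include <arm_neon.h>

// trn1/trn2 interleave two input rows into one register, forming the 2x8
// signed-byte LHS operand; smmla (vmmlaq_s32) then accumulates
// LHS(2x8) x RHS(8x2) into a 2x2 tile of 32-bit results held in a single
// accumulator register.
static inline int32x4_t smmla_tile(int32x4_t acc, int8x16_t lhs_2x8,
                                   int8x16_t rhs_8x2)
{
    return vmmlaq_s32(acc, lhs_2x8, rhs_8x2);
}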
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
index f48623e129..f8a76b5244 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_s8s32_mmla_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 186f\n"
@@ -178,11 +177,11 @@ void a64_hybrid_s8s32_mmla_6x16 (
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -198,41 +197,41 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"cmp x27, #0x20\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
@@ -240,40 +239,40 @@ void a64_hybrid_s8s32_mmla_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e87a688 // smmla v8.4s, v20.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a68c // smmla v12.4s, v20.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a689 // smmla v9.4s, v20.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a68d // smmla v13.4s, v20.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a68a // smmla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x4e91a68e // smmla v14.4s, v20.16b, v17.16b\n"
+ "ldr q18, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e93a68b // smmla v11.4s, v20.16b, v19.16b\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4e92a68f // smmla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e93a42c // smmla v12.4s, v1.16b, v19.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
@@ -281,26 +280,26 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
@@ -325,23 +324,23 @@ void a64_hybrid_s8s32_mmla_6x16 (
"25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x4e97a668 // smmla v8.4s, v19.16b, v23.16b\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4e92a66c // smmla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
+ ".inst 0x4e91a669 // smmla v9.4s, v19.16b, v17.16b\n"
+ "ldr q20, [x10, #0x40]\n"
+ ".inst 0x4e9fa66d // smmla v13.4s, v19.16b, v31.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e94a66a // smmla v10.4s, v19.16b, v20.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"27:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -525,12 +524,12 @@ void a64_hybrid_s8s32_mmla_6x16 (
"52:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 53f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 54f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -538,7 +537,7 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 54f\n"
"53:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"54:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 57f\n"
@@ -549,85 +548,85 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 56f\n"
"55:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x20\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"bge 55b\n"
"56:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
"sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
@@ -637,27 +636,27 @@ void a64_hybrid_s8s32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 59f\n"
"58:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x60]\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x4e91a668 // smmla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x4e96a66c // smmla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e81a669 // smmla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
"cmp x27, #0x8\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Skip odd blocks
@@ -689,23 +688,23 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr b1, [x26, #0x0]\n"
"ldr b2, [x25, #0x0]\n"
"63:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n"
+ "ldr q21, [x10, #0x30]\n"
+ ".inst 0x4e85a669 // smmla v9.4s, v19.16b, v5.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e95a66d // smmla v13.4s, v19.16b, v21.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"64:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -953,13 +952,13 @@ void a64_hybrid_s8s32_mmla_6x16 (
"89:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 90f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 91f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -968,8 +967,8 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 91f\n"
"90:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"91:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 94f\n"
@@ -981,167 +980,167 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 93f\n"
"92:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 92b\n"
"93:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
"94:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 101f\n"
"cmp x27, #0x8\n"
"blt 96f\n"
"95:" // Height 3: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x8\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"bge 95b\n"
"96:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 101f\n"
@@ -1179,33 +1178,33 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr b2, [x25, #0x0]\n"
"ldr b3, [x24, #0x0]\n"
"100:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e9da78c // smmla v12.4s, v28.16b, v29.16b\n"
+ ".inst 0x4e9da774 // smmla v20.4s, v27.16b, v29.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"101:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1499,14 +1498,14 @@ void a64_hybrid_s8s32_mmla_6x16 (
"126:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 127f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 128f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1516,9 +1515,9 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 128f\n"
"127:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"128:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 131f\n"
@@ -1531,173 +1530,173 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 130f\n"
"129:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
"add x23, x23, #0x10\n"
"ldr q4, [x23, #0x0]\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 129b\n"
"130:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
"add x23, x23, #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
"131:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 138f\n"
"cmp x27, #0x8\n"
"blt 133f\n"
"132:" // Height 4: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"bge 132b\n"
"133:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 138f\n"
@@ -1742,33 +1741,33 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr b3, [x24, #0x0]\n"
"ldr b4, [x23, #0x0]\n"
"137:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
"138:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2125,15 +2124,15 @@ void a64_hybrid_s8s32_mmla_6x16 (
"163:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 164f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 165f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2144,10 +2143,10 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 165f\n"
"164:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"165:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 168f\n"
@@ -2160,174 +2159,174 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q7, [x10, #0x0]\n"
"blt 167f\n"
"166:" // Height 5: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
"add x25, x25, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
"cmp x27, #0x20\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"bge 166b\n"
"167:" // Height 5: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
"add x24, x24, #0x10\n"
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
"add x23, x23, #0x10\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
"add x22, x22, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n"
".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
@@ -2337,48 +2336,48 @@ void a64_hybrid_s8s32_mmla_6x16 (
"blt 170f\n"
"169:" // Height 5: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
"bge 169b\n"
"170:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 175f\n"
@@ -2430,42 +2429,42 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr b4, [x23, #0x0]\n"
"ldr b5, [x22, #0x0]\n"
"174:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x4e86a4e8 // smmla v8.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a470 // smmla v16.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n"
"175:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2872,16 +2871,16 @@ void a64_hybrid_s8s32_mmla_6x16 (
"200:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 201f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 202f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2893,11 +2892,11 @@ void a64_hybrid_s8s32_mmla_6x16 (
"b 202f\n"
"201:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"202:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 205f\n"
@@ -2964,42 +2963,42 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr q2, [x25, #0x0]\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
"bge 203b\n"
@@ -3055,35 +3054,35 @@ void a64_hybrid_s8s32_mmla_6x16 (
".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n"
".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
@@ -3093,49 +3092,49 @@ void a64_hybrid_s8s32_mmla_6x16 (
"blt 207f\n"
"206:" // Height 6: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
"bge 206b\n"
"207:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 212f\n"
@@ -3194,42 +3193,42 @@ void a64_hybrid_s8s32_mmla_6x16 (
"ldr b5, [x22, #0x0]\n"
"ldr b6, [x21, #0x0]\n"
"211:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e80a4e8 // smmla v8.4s, v7.16b, v0.16b\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x4e80a470 // smmla v16.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a458 // smmla v24.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n"
"212:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3440,7 +3439,6 @@ void a64_hybrid_s8s32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index ebc43425b8..14aba00788 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -84,7 +84,7 @@ public:
case CPUModel::A510:
return { 14.81 };
case CPUModel::V1:
- return { 48.36 };
+ return { 44.54 };
}
}
@@ -108,5 +108,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index b9caf545f1..00d063b426 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -78,329 +78,328 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[output_ptr]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "cbnz x12, 6f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
- "mov v4.d[1], x9\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr d25, [x12, #0xa0]\n"
+ "mov v21.d[1], x20\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- "mov v5.d[1], x28\n"
- "ldr x27, [x13, #0x98]\n"
- "mov v6.d[1], x27\n"
- "ldr x26, [x13, #0xa8]\n"
- "mov v7.d[1], x26\n"
- "ldr x25, [x13, #0xb8]\n"
- "mov v8.d[1], x25\n"
- "ldr x24, [x13, #0xc8]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr x20, [x13, #0xd8]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr x9, [x13, #0xe8]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr x28, [x13, #0xf8]\n"
- "mov v9.d[1], x24\n"
- "mov v10.d[1], x20\n"
- "add x10, x10, #0x10\n"
- "mov v4.d[1], x9\n"
- "add x13, x13, #0x100\n"
- "mov v5.d[1], x28\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr d22, [x12, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr d21, [x12, #0xe0]\n"
+ "mov v20.d[1], x20\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr d20, [x12, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ "mov v23.d[1], x23\n"
+ "mov v22.d[1], x22\n"
+ "add x9, x9, #0x10\n"
+ "mov v21.d[1], x21\n"
+ "add x12, x12, #0x100\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q4, [x13, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "sub x11, x11, #0x10\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr q22, [x12, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x12, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x12, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 18f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "cbz x11, 18f\n"
- "tbz x11, #1, 15f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 16f\n"
- "ld1 { v0.b }[2], [x10]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x6f80e292 // udot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
- "neg v1.4s, v1.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "cmp x15, #0x10\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x15, #3, 24f\n"
- "str d16, [x14], #0x8\n"
- "tbz x15, #2, 22f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "tbz x15, #1, 21f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[14], [x14]\n"
+ "tbz x14, #3, 24f\n"
+ "str d16, [x13], #0x8\n"
+ "tbz x14, #2, 22f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "tbz x14, #1, 21f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[12], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 23f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[10], [x14]\n"
+ "tbz x14, #1, 23f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[8], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 26f\n"
- "str s16, [x14], #0x4\n"
- "tbz x15, #1, 25f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[6], [x14]\n"
+ "tbz x14, #2, 26f\n"
+ "str s16, [x13], #0x4\n"
+ "tbz x14, #1, 25f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[4], [x14]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 27f\n"
- "str h16, [x14], #0x2\n"
- "tbz x15, #0, 28f\n"
- "st1 { v16.b }[2], [x14]\n"
+ "tbz x14, #1, 27f\n"
+ "str h16, [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -411,307 +410,307 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "cbnz x12, 36f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v25.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v5.d[1], x28\n"
+ "mov v24.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v6.d[1], x27\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- "mov v7.d[1], x26\n"
- "ldr x24, [x13, #0xc8]\n"
- "mov v8.d[1], x25\n"
- "ldr x20, [x13, #0xd8]\n"
- "ldr x9, [x13, #0xe8]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- "ldr x28, [x13, #0xf8]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "mov v9.d[1], x24\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v10.d[1], x20\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- "mov v4.d[1], x9\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x13, x13, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr d26, [x12, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr d25, [x12, #0xe0]\n"
+ "mov v29.d[1], x21\n"
+ "ldr x23, [x12, #0xc8]\n"
+ "mov v28.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "mov v27.d[1], x23\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ "mov v26.d[1], x22\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr q26, [x12, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x12, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x12, #0xf0]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 48f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "cbz x11, 48f\n"
- "tbz x11, #1, 45f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "tbz x11, #0, 46f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "add x23, x13, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
- "neg v2.4s, v2.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -721,122 +720,122 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x15, #3, 54f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "tbz x15, #2, 52f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "tbz x15, #1, 51f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "tbz x14, #3, 54f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 53f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "tbz x14, #1, 53f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 56f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "tbz x15, #1, 55f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "tbz x14, #2, 56f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 57f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "tbz x15, #0, 58f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "tbz x14, #1, 57f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -851,317 +850,317 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "cbnz x12, 66f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
- "add x22, x23, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v29.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v5.d[1], x28\n"
+ "mov v28.d[1], x23\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "mov v6.d[1], x27\n"
+ "mov v5.d[1], x22\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v7.d[1], x26\n"
+ "mov v4.d[1], x21\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr x24, [x13, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v8.d[1], x25\n"
+ "mov v3.d[1], x20\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x20, [x13, #0xd8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr x9, [x13, #0xe8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x28, [x13, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v9.d[1], x24\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "mov v10.d[1], x20\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x9\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "add x10, x10, #0x10\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr d30, [x12, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ "mov v31.d[1], x23\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr d29, [x12, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr d28, [x12, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ "add x27, x27, #0x10\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr q30, [x12, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x12, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x12, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 78f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "cbz x11, 78f\n"
- "tbz x11, #1, 75f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "tbz x11, #0, 76f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x6f81e394 // udot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x6f82e398 // udot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
- "add x21, x22, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
- "neg v3.4s, v3.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1171,73 +1170,73 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1251,156 +1250,156 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x15, #3, 84f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "tbz x15, #2, 82f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "tbz x15, #1, 81f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "tbz x14, #3, 84f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 83f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "tbz x14, #1, 83f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 86f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "tbz x15, #1, 85f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "tbz x14, #2, 86f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 87f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "tbz x15, #0, 88f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "tbz x14, #1, 87f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x4\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x13, %x[output_ptr]\n"
"madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
@@ -1420,117 +1419,117 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
- "cbnz x12, 96f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
- "add x23, x23, x20\n"
- "add x22, x22, x20\n"
- "add x21, x21, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x23, x10, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q1, [x23, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v4.d[1], x9\n"
+ "mov v4.d[1], x22\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr x26, [x13, #0xa8]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr x25, [x13, #0xb8]\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v5.d[1], x28\n"
+ "mov v5.d[1], x21\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x24, [x13, #0xc8]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr x20, [x13, #0xd8]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x27\n"
+ "mov v6.d[1], x20\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x9, [x13, #0xe8]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr x28, [x13, #0xf8]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v7.d[1], x26\n"
+ "mov v7.d[1], x25\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v8.d[1], x25\n"
+ "mov v8.d[1], x24\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "add x21, x21, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d9, [x13, #0xc0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v9.d[1], x24\n"
+ "mov v9.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr d10, [x13, #0xd0]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v10.d[1], x20\n"
+ "mov v10.d[1], x22\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr d4, [x13, #0xe0]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x9\n"
+ "mov v4.d[1], x21\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d5, [x13, #0xf0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v5.d[1], x28\n"
+ "mov v5.d[1], x20\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "add x13, x13, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
@@ -1563,77 +1562,77 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "ldr q0, [x10, #0x0]\n"
- "sub x11, x11, #0x10\n"
- "ldr q1, [x23, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q4, [x13, #0x70]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x13, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x13, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x13, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x13, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x13, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x13, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x13, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x13, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x13, x13, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,67 +1666,67 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 108f\n"
- "cmp x11, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x21], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x13, #0x0]\n"
- "sub x11, x11, #0x4\n"
- "ldr q7, [x13, #0x10]\n"
- "cmp x11, #0x4\n"
- "ldr q8, [x13, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x13, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "cbz x11, 108f\n"
- "tbz x11, #1, 105f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h3, [x21], #0x2\n"
- "tbz x11, #0, 106f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x23]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "ld1 { v3.b }[2], [x21]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x23, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "ldr b3, [x21, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
@@ -1735,64 +1734,64 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x13, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x13, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x13, #0x20]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x13, #0x30]\n"
- ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "add x13, x13, #0x40\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x20\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x14, x20\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
"add x21, x22, x20\n"
- "add x20, x21, x20\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "neg v4.4s, v4.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
- "ldr q0, [x16, #0x0]\n"
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1806,93 +1805,93 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v29.4s, v29.4s, v14.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "add x16, x16, #0x40\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1910,172 +1909,172 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "cmp x15, #0x10\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x15, #3, 114f\n"
- "str d16, [x14], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
- "tbz x15, #2, 112f\n"
- "st1 { v16.s }[2], [x14], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
- "tbz x15, #1, 111f\n"
- "st1 { v16.h }[6], [x14], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[14], [x14]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "tbz x14, #3, 114f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[12], [x14]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 113f\n"
- "st1 { v16.h }[4], [x14], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[10], [x14]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "tbz x14, #1, 113f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[8], [x14]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 116f\n"
- "str s16, [x14], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
- "tbz x15, #1, 115f\n"
- "st1 { v16.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[6], [x14]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "tbz x14, #2, 116f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[4], [x14]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 117f\n"
- "str h16, [x14], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
- "tbz x15, #0, 118f\n"
- "st1 { v16.b }[2], [x14]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "tbz x14, #1, 117f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x14, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
@@ -2089,10 +2088,9 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
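
(Editor's note, not part of the patch: the hunks above only rename scratch registers and pointer registers; the requantize-and-store epilogue they touch is functionally unchanged. As a reading aid, below is a minimal C++ sketch of that epilogue using NEON intrinsics: sqrdmulh by per_layer_mul, the optional sign correction under flags bit 5, a rounding shift by per_layer_right_shift, the c_offset add, the min/max clamp, and the uzp1 narrowing to bytes. It covers the per-layer scaling and clamp only; the row-sum and column-bias fixups that precede it in the asm are omitted. The helper name and scalar-parameter shape are hypothetical; the Requantize32 field names are taken from the asm operand lists in this file.)

    #include <arm_neon.h>
    #include <cstdint>

    // Hypothetical helper mirroring the "Height N" epilogue in the asm above:
    // requantize four int32x4 accumulators and pack them into 16 output bytes.
    static inline uint8x16_t requantize_u8_block(int32x4_t r0, int32x4_t r1,
                                                 int32x4_t r2, int32x4_t r3,
                                                 int32_t per_layer_mul,
                                                 int32_t per_layer_right_shift,
                                                 int32_t c_offset,
                                                 int32_t minval, int32_t maxval,
                                                 bool shift_correction) // %x[flags] bit 5
    {
        const int32x4_t vmul   = vdupq_n_s32(per_layer_mul);         // ld1r { v.4s } of per_layer_mul
        const int32x4_t vshift = vdupq_n_s32(per_layer_right_shift); // negative lanes = right shift
        const int32x4_t voff   = vdupq_n_s32(c_offset);
        const int32x4_t vmin   = vdupq_n_s32(minval);
        const int32x4_t vmax   = vdupq_n_s32(maxval);

        int32x4_t r[4] = { r0, r1, r2, r3 };
        for (int32x4_t &v : r) {
            v = vqrdmulhq_s32(v, vmul);                              // sqrdmulh
            if (shift_correction) {                                  // and / sshr #0x1f / sqadd
                const int32x4_t t = vshrq_n_s32(vandq_s32(v, vshift), 31);
                v = vqaddq_s32(v, t);                                // nudge negatives before rounding
            }
            v = vrshlq_s32(v, vshift);                               // srshl (rounding shift)
            v = vaddq_s32(v, voff);                                  // add c_offset
            v = vminq_s32(v, vmax);                                  // smin against maxval
            v = vmaxq_s32(v, vmin);                                  // smax against minval
        }
        // After clamping, each 32-bit lane fits in its low byte, so two uzp1
        // passes (.8h then .16b) keeping the low halves perform the narrowing,
        // exactly as in the asm above.
        const int16x8_t lo = vuzp1q_s16(vreinterpretq_s16_s32(r[0]),
                                        vreinterpretq_s16_s32(r[1]));
        const int16x8_t hi = vuzp1q_s16(vreinterpretq_s16_s32(r[2]),
                                        vreinterpretq_s16_s32(r[3]));
        return vreinterpretq_u8_s8(vuzp1q_s8(vreinterpretq_s8_s16(lo),
                                             vreinterpretq_s8_s16(hi)));
    }

(The full and partial writeback paths then store this 16-byte result, or a bit-tested prefix of it, through the per-row output pointers.)
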
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 31fbf88603..ebe583b5d4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -78,7 +78,6 @@ void a64_hybrid_u8qa_dot_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
@@ -102,11 +101,11 @@ void a64_hybrid_u8qa_dot_4x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -128,32 +127,32 @@ void a64_hybrid_u8qa_dot_4x16 (
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
@@ -171,33 +170,33 @@ void a64_hybrid_u8qa_dot_4x16 (
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
"add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
@@ -211,16 +210,16 @@ void a64_hybrid_u8qa_dot_4x16 (
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q23, [x28, #0x0]\n"
+ "ldr q22, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
@@ -236,14 +235,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x6f80e2b0 // udot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -252,72 +251,72 @@ void a64_hybrid_u8qa_dot_4x16 (
"bne 4b\n"
"prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v1.4s, v1.4s\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q22, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
"add x10, x10, #0x40\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
"cmp x9, #0x10\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
@@ -397,12 +396,12 @@ void a64_hybrid_u8qa_dot_4x16 (
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -410,7 +409,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"b 36f\n"
"35:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"36:" // Height 2: input setup done
"cmp x25, #0x10\n"
"blt 41f\n"
@@ -428,48 +427,48 @@ void a64_hybrid_u8qa_dot_4x16 (
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
"add x24, x24, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"add x23, x23, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
@@ -491,49 +490,49 @@ void a64_hybrid_u8qa_dot_4x16 (
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
"sub x25, x25, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"add x24, x24, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
"add x23, x23, #0x10\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
@@ -551,21 +550,21 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q27, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
+ ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
"cbz x25, 48f\n"
@@ -584,209 +583,209 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 34b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
+ "add x23, x27, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v2.4s, v2.4s\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
"add x10, x10, #0x40\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v27.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v25.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
"tbz x9, #3, 54f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
@@ -819,13 +818,13 @@ void a64_hybrid_u8qa_dot_4x16 (
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 66f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -834,8 +833,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"b 66f\n"
"65:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"66:" // Height 3: input setup done
"cmp x25, #0x10\n"
"blt 71f\n"
@@ -857,62 +856,62 @@ void a64_hybrid_u8qa_dot_4x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q28, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
@@ -940,63 +939,63 @@ void a64_hybrid_u8qa_dot_4x16 (
"sub x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q28, [x28, #0x80]\n"
"add x22, x22, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
@@ -1018,25 +1017,25 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
"cbz x25, 78f\n"
@@ -1059,144 +1058,144 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 64b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q31, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q30, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v31.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v31.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1204,132 +1203,132 @@ void a64_hybrid_u8qa_dot_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
"tbz x9, #3, 84f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
@@ -1370,14 +1369,14 @@ void a64_hybrid_u8qa_dot_4x16 (
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 96f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1387,9 +1386,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"b 96f\n"
"95:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"96:" // Height 4: input setup done
"cmp x25, #0x10\n"
"blt 101f\n"
@@ -1614,29 +1613,29 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x28, #0x0]\n"
- "ldr q7, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ldr q8, [x28, #0x20]\n"
- "ldr q9, [x28, #0x30]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
"cbz x25, 108f\n"
@@ -1663,73 +1662,73 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
"ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 94b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x20, x21, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
@@ -1740,100 +1739,100 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1845,163 +1844,163 @@ void a64_hybrid_u8qa_dot_4x16 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
"tbz x9, #3, 114f\n"
"str d16, [x27], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
"st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
"st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
- "st1 { v28.h }[6], [x20], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
- "st1 { v28.b }[14], [x20]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
"st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
- "st1 { v28.b }[12], [x20]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
"st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
- "st1 { v28.h }[4], [x20], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
- "st1 { v28.b }[10], [x20]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
"st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
- "st1 { v28.b }[8], [x20]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
"str s16, [x27], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
"st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
- "st1 { v28.h }[2], [x20], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
- "st1 { v28.b }[6], [x20]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
"st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
- "st1 { v28.b }[4], [x20]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
"str h16, [x27], #0x2\n"
- "str h20, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
- "str h28, [x20], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
"st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
- "st1 { v28.b }[2], [x20]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
"str b16, [x27, #0x0]\n"
- "str b20, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
- "str b28, [x20, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
"str q16, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q20, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
- "str q28, [x20, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
@@ -2017,7 +2016,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
index 8a47701a4a..17e7405a0a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -81,7 +81,7 @@ public:
case CPUModel::A510:
return { 28.00 };
case CPUModel::V1:
- return { 68.98 };
+ return { 62.26 };
}
}
@@ -98,5 +98,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
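The header change above retunes the Neoverse V1 throughput estimate from 68.98 to 62.26 (units as defined by the PerformanceParameters header it includes). Estimates like these let the gemm dispatcher rank candidate kernels per CPU model; a hypothetical illustration of that idea, not the library's actual selection code:

#include <string>
#include <vector>

// Hypothetical sketch: pick the kernel with the best benchmark-derived
// throughput estimate for the detected CPU (names are illustrative).
struct Candidate
{
    std::string name;
    double estimated_throughput; // e.g. 62.26 for u8qa_mmla_4x16 on V1
};

static const Candidate *pick_fastest(const std::vector<Candidate> &candidates)
{
    const Candidate *best = nullptr;
    for (const auto &c : candidates)
        if (best == nullptr || c.estimated_throughput > best->estimated_throughput)
            best = &c;
    return best;
}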
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
index f808cb199d..1335b355ef 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
@@ -78,7 +78,6 @@ void a64_hybrid_u8qa_mmla_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 97f\n"
@@ -106,11 +105,11 @@ void a64_hybrid_u8qa_mmla_4x16 (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -131,35 +130,35 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr q4, [x28, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v27.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v27.2d\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
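In the MMLA variant each accumulator holds a 2x2 tile rather than a row of lanes: trn1 v0.2d, v1.2d, v2.2d pairs the low 64-bit halves of two input rows and trn2 pairs the high halves, so every ummla consumes eight bytes from each of two rows at once. A reference model of the instruction, assuming the Armv8.6 I8MM definition:

#include <cstdint>

// Reference for "ummla vd.4s, vn.16b, vm.16b": vd, viewed as a 2x2 int32
// tile, accumulates A * B^T, where the two 64-bit halves of vn hold the two
// 8-byte rows of A and the halves of vm hold the two columns of B (i.e. the
// rows of B transposed).
static void ummla_ref(uint32_t d[2][2], const uint8_t a[2][8],
                      const uint8_t bt[2][8])
{
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                d[i][j] += uint32_t(a[i][k]) * uint32_t(bt[j][k]);
}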
@@ -177,36 +176,36 @@ void a64_hybrid_u8qa_mmla_4x16 (
"prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v24.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
"add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
@@ -217,29 +216,29 @@ void a64_hybrid_u8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x24], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
- ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"bge 12b\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
@@ -264,26 +263,26 @@ void a64_hybrid_u8qa_mmla_4x16 (
"17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x24, #0x0]\n"
"18:" // Height 1: Multiply loop: Ragged operand read: Done
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v0.2d, v1.2d, v24.2d\n"
"tbnz %x[flags], #31, 19f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"19:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
- ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"20:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -297,75 +296,75 @@ void a64_hybrid_u8qa_mmla_4x16 (
"uzp1 v19.2d, v19.2d, v23.2d\n"
"mov v23.16b, v16.16b\n"
"tbnz %x[flags], #31, 21f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v1.4s, v1.4s\n"
+ "neg v16.4s, v16.4s\n"
"dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v16.4s\n"
"21:" // Height 1: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
"add v23.4s, v23.4s, v11.4s\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "ldr q20, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
"add v19.4s, v19.4s, v11.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v16.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v16.4s\n"
"add x10, x10, #0x40\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
"tbz %x[flags], #5, 22f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v22.16b, v23.16b, v0.16b\n"
+ "and v21.16b, v17.16b, v0.16b\n"
+ "and v20.16b, v18.16b, v0.16b\n"
+ "and v16.16b, v19.16b, v0.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v22.4s\n"
+ "sqadd v17.4s, v17.4s, v21.4s\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
"22:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
"cmp x9, #0x10\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"uzp1 v23.8h, v23.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v23.16b, v23.16b, v17.16b\n"
+ "uzp1 v16.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v16.16b\n"
"bge 31f\n"
"tbz x9, #3, 26f\n"
"str d23, [x27], #0x8\n"
@@ -442,12 +441,12 @@ void a64_hybrid_u8qa_mmla_4x16 (
"36:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 37f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 38f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -455,7 +454,7 @@ void a64_hybrid_u8qa_mmla_4x16 (
"b 38f\n"
"37:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"38:" // Height 2: input setup done
"cmp x25, #0x10\n"
"blt 43f\n"
@@ -473,34 +472,34 @@ void a64_hybrid_u8qa_mmla_4x16 (
"39:" // Height 2: Multiply loop: Main loop head
"trn1 v0.2d, v1.2d, v2.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
- ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
@@ -522,35 +521,35 @@ void a64_hybrid_u8qa_mmla_4x16 (
"41:" // Height 2: Multiply loop: Single iteration only
"trn1 v0.2d, v1.2d, v2.2d\n"
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q25, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q24, [x28, #0x80]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
- "ldr q8, [x28, #0xa0]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
- "ldr q9, [x28, #0xb0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
- "ldr q10, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- "ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
"sub x25, x25, #0x10\n"
- ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
- ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
- ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
"add x28, x28, #0x100\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
"tbnz %x[flags], #31, 42f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
@@ -562,30 +561,30 @@ void a64_hybrid_u8qa_mmla_4x16 (
"cmp x25, #0x8\n"
"blt 46f\n"
"44:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
"tbnz %x[flags], #31, 45f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"45:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
"sub x25, x25, #0x8\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
"cmp x25, #0x8\n"
- ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
- "ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"bge 44b\n"
"46:" // Height 2: Multiply loop: Skip odd blocks
@@ -621,22 +620,22 @@ void a64_hybrid_u8qa_mmla_4x16 (
"tbnz %x[flags], #31, 51f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"51:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
- ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
"add x28, x28, #0x80\n"
"52:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -644,127 +643,127 @@ void a64_hybrid_u8qa_mmla_4x16 (
"cmp x26, x20\n"
"bne 36b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
+ "uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
- "mov v23.16b, v4.16b\n"
+ "mov v23.16b, v24.16b\n"
"tbnz %x[flags], #31, 53f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "neg v2.4s, v2.4s\n"
+ "neg v24.4s, v24.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"53:" // Height 2: skip row sum fixup
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v23.4s, v23.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
"add x10, x10, #0x40\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
"tbz %x[flags], #5, 54f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v0.16b\n"
+ "and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
"54:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v23.4s, v23.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
"uzp1 v23.8h, v23.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
@@ -774,68 +773,68 @@ void a64_hybrid_u8qa_mmla_4x16 (
"bge 63f\n"
"tbz x9, #3, 58f\n"
"str d23, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
+ "str d16, [x23], #0x8\n"
"tbz x9, #2, 56f\n"
"st1 { v23.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
"tbz x9, #1, 55f\n"
"st1 { v23.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x23]\n"
"b 62f\n"
"55:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 62f\n"
"st1 { v23.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x23]\n"
"b 62f\n"
"56:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 57f\n"
"st1 { v23.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x23]\n"
"b 62f\n"
"57:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 62f\n"
"st1 { v23.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x23]\n"
"b 62f\n"
"58:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 60f\n"
"str s23, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
+ "str s16, [x23], #0x4\n"
"tbz x9, #1, 59f\n"
"st1 { v23.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x23]\n"
"b 62f\n"
"59:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 62f\n"
"st1 { v23.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x23]\n"
"b 62f\n"
"60:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 61f\n"
"str h23, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
+ "str h16, [x23], #0x2\n"
"tbz x9, #0, 62f\n"
"st1 { v23.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x23]\n"
"b 62f\n"
"61:" // Height 2: Partial direct writeback: partial_1_0
"str b23, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
"62:" // Height 2: Partial direct writeback: Done
"b 64f\n"
"63:" // Height 2: Full writeback
"str q23, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
"64:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 34b\n"
@@ -872,13 +871,13 @@ void a64_hybrid_u8qa_mmla_4x16 (
"68:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 69f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -887,8 +886,8 @@ void a64_hybrid_u8qa_mmla_4x16 (
"b 70f\n"
"69:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"70:" // Height 3: input setup done
"cmp x25, #0x10\n"
"blt 75f\n"
@@ -909,12 +908,12 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q4, [x28, #0x80]\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
@@ -930,15 +929,15 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
- ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
@@ -948,12 +947,12 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
"tbnz %x[flags], #31, 72f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
@@ -981,12 +980,12 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- "ldr q5, [x28, #0x70]\n"
+ "ldr q14, [x28, #0x70]\n"
".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- "ldr q4, [x28, #0x60]\n"
+ "ldr q5, [x28, #0x60]\n"
".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q4, [x28, #0x80]\n"
".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q7, [x28, #0x90]\n"
@@ -1003,15 +1002,15 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
"ldr q10, [x28, #0xc0]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
- "ldr q4, [x28, #0xd0]\n"
- ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
"ldr q5, [x28, #0xe0]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
- "ldr q6, [x28, #0xf0]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
@@ -1021,12 +1020,12 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
- ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
"tbnz %x[flags], #31, 74f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
@@ -1042,41 +1041,41 @@ void a64_hybrid_u8qa_mmla_4x16 (
"blt 78f\n"
"76:" // Height 3: Multiply loop: Odd block loop
"ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x22], #0x8\n"
- "trn1 v2.2d, v3.2d, v7.2d\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x22], #0x8\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
"sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
- ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n"
- ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
"add x28, x28, #0x80\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n"
- ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
"bge 76b\n"
"78:" // Height 3: Multiply loop: Skip odd blocks
"cbz x25, 84f\n"
@@ -1115,52 +1114,52 @@ void a64_hybrid_u8qa_mmla_4x16 (
"ldr b3, [x22, #0x0]\n"
"82:" // Height 3: Multiply loop: Ragged operand read: Done
"trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v9.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
"tbnz %x[flags], #31, 83f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"83:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n"
- "ldr q5, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
- ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
"add x28, x28, #0x80\n"
- ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n"
- ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n"
- ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
"84:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 68b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
@@ -1168,116 +1167,116 @@ void a64_hybrid_u8qa_mmla_4x16 (
"uzp1 v25.2d, v25.2d, v29.2d\n"
"uzp1 v26.2d, v26.2d, v30.2d\n"
"uzp1 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v4.16b\n"
+ "mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 85f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
+ "neg v23.4s, v23.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
"dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v23.4s\n"
+ "mul v12.4s, v12.4s, v23.4s\n"
+ "mul v13.4s, v13.4s, v23.4s\n"
"85:" // Height 3: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q30, [x10, #0x10]\n"
"add v31.4s, v31.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "ldr q28, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v23.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v23.4s\n"
"tbz %x[flags], #5, 86f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
+ "and v1.16b, v19.16b, v0.16b\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v0.16b\n"
+ "and v28.16b, v26.16b, v0.16b\n"
+ "and v23.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
"86:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1285,132 +1284,132 @@ void a64_hybrid_u8qa_mmla_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
"uzp1 v31.8h, v31.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
"uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 95f\n"
"tbz x9, #3, 90f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 88f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 87f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v24.h }[6], [x21], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v24.b }[14], [x21]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 94f\n"
"87:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 94f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v24.b }[12], [x21]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 94f\n"
"88:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 89f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v24.h }[4], [x21], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v24.b }[10], [x21]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 94f\n"
"89:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 94f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v24.b }[8], [x21]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 94f\n"
"90:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 92f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 91f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v24.h }[2], [x21], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v24.b }[6], [x21]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 94f\n"
"91:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 94f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v24.b }[4], [x21]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 94f\n"
"92:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 93f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h24, [x21], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 94f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v24.b }[2], [x21]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 94f\n"
"93:" // Height 3: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b24, [x21, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"94:" // Height 3: Partial direct writeback: Done
"b 96f\n"
"95:" // Height 3: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
- "str q24, [x21, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"96:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 66b\n"
@@ -1451,14 +1450,14 @@ void a64_hybrid_u8qa_mmla_4x16 (
"100:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 101f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 102f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1468,9 +1467,9 @@ void a64_hybrid_u8qa_mmla_4x16 (
"b 102f\n"
"101:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"102:" // Height 4: input setup done
"cmp x25, #0x10\n"
"blt 107f\n"
@@ -1630,42 +1629,42 @@ void a64_hybrid_u8qa_mmla_4x16 (
"blt 110f\n"
"108:" // Height 4: Multiply loop: Odd block loop
"ldr d1, [x24], #0x8\n"
- "ldr d2, [x23], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v2.2d, v3.2d, v7.2d\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "trn1 v2.2d, v2.2d, v1.2d\n"
"tbnz %x[flags], #31, 109f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"109:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q4, [x28, #0x30]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
"sub x25, x25, #0x8\n"
"cmp x25, #0x8\n"
"ldr q5, [x28, #0x40]\n"
- "ldr q6, [x28, #0x50]\n"
- ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
- ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n"
- "ldr q7, [x28, #0x60]\n"
- "ldr q8, [x28, #0x70]\n"
- ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n"
- ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
"add x28, x28, #0x80\n"
".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
- ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n"
- ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
"bge 108b\n"
"110:" // Height 4: Multiply loop: Skip odd blocks
"cbz x25, 116f\n"
@@ -1716,51 +1715,51 @@ void a64_hybrid_u8qa_mmla_4x16 (
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"115:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n"
- "ldr q5, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
"ldr q6, [x28, #0x30]\n"
- ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n"
- "ldr q7, [x28, #0x40]\n"
- "ldr q8, [x28, #0x50]\n"
- ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q9, [x28, #0x60]\n"
- "ldr q10, [x28, #0x70]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
"add x28, x28, #0x80\n"
- ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
- ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n"
- ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
- ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n"
- ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
- ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
"116:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 100b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 v4.2d, v16.2d, v20.2d\n"
- "add x22, x27, x20\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"add x21, x22, x20\n"
- "add x20, x21, x20\n"
"uzp2 v16.2d, v16.2d, v20.2d\n"
"uzp1 v20.2d, v17.2d, v21.2d\n"
"prfm pstl1keep, [x27, #0x0]\n"
"uzp2 v17.2d, v17.2d, v21.2d\n"
"uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v18.2d, v18.2d, v22.2d\n"
"uzp1 v22.2d, v19.2d, v23.2d\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"uzp2 v19.2d, v19.2d, v23.2d\n"
"uzp1 v23.2d, v24.2d, v28.2d\n"
"uzp2 v24.2d, v24.2d, v28.2d\n"
@@ -1770,38 +1769,38 @@ void a64_hybrid_u8qa_mmla_4x16 (
"uzp2 v26.2d, v26.2d, v30.2d\n"
"uzp1 v30.2d, v27.2d, v31.2d\n"
"uzp2 v27.2d, v27.2d, v31.2d\n"
- "mov v31.16b, v4.16b\n"
+ "mov v31.16b, v0.16b\n"
"tbnz %x[flags], #31, 117f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v4.4s, v4.4s\n"
+ "neg v0.4s, v0.4s\n"
"dup v12.4s, v11.s[3]\n"
"dup v11.4s, v11.s[0]\n"
"dup v14.4s, v13.s[3]\n"
"dup v13.4s, v13.s[0]\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"117:" // Height 4: skip row sum fixup
"ldr q0, [x10, #0x0]\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v31.4s, v31.4s, v11.4s\n"
"add v20.4s, v20.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v21.4s, v21.4s, v11.4s\n"
"add v22.4s, v22.4s, v11.4s\n"
"add v16.4s, v16.4s, v12.4s\n"
"add v17.4s, v17.4s, v12.4s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v18.4s, v18.4s, v12.4s\n"
"add v19.4s, v19.4s, v12.4s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v23.4s, v23.4s, v13.4s\n"
"add v28.4s, v28.4s, v13.4s\n"
"add x10, x10, #0x40\n"
@@ -1812,100 +1811,100 @@ void a64_hybrid_u8qa_mmla_4x16 (
"add v26.4s, v26.4s, v14.4s\n"
"add v27.4s, v27.4s, v14.4s\n"
"add v31.4s, v31.4s, v0.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v21.4s, v21.4s, v2.4s\n"
- "add v22.4s, v22.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v23.4s, v23.4s, v0.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v29.4s, v29.4s, v2.4s\n"
- "add v30.4s, v30.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
"tbz %x[flags], #5, 118f\n"
- "and v4.16b, v31.16b, v0.16b\n"
- "and v5.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v31.4s, v31.4s, v4.4s\n"
- "sqadd v20.4s, v20.4s, v5.4s\n"
- "and v6.16b, v21.16b, v0.16b\n"
- "and v7.16b, v22.16b, v0.16b\n"
- "and v8.16b, v16.16b, v0.16b\n"
- "and v9.16b, v17.16b, v0.16b\n"
- "and v10.16b, v18.16b, v0.16b\n"
- "and v4.16b, v19.16b, v0.16b\n"
- "and v5.16b, v23.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v31.16b, v0.16b\n"
+ "and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
+ "and v7.16b, v21.16b, v0.16b\n"
+ "and v6.16b, v22.16b, v0.16b\n"
+ "and v5.16b, v16.16b, v0.16b\n"
+ "and v4.16b, v17.16b, v0.16b\n"
+ "and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v6.4s\n"
- "sqadd v22.4s, v22.4s, v7.4s\n"
- "sqadd v16.4s, v16.4s, v8.4s\n"
- "sqadd v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sqadd v23.4s, v23.4s, v5.4s\n"
- "and v6.16b, v28.16b, v0.16b\n"
- "and v7.16b, v29.16b, v0.16b\n"
- "and v8.16b, v30.16b, v0.16b\n"
- "and v9.16b, v24.16b, v0.16b\n"
- "and v10.16b, v25.16b, v0.16b\n"
- "and v4.16b, v26.16b, v0.16b\n"
- "and v5.16b, v27.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v0.16b\n"
+ "and v5.16b, v30.16b, v0.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "and v3.16b, v25.16b, v0.16b\n"
+ "and v2.16b, v26.16b, v0.16b\n"
+ "and v1.16b, v27.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v6.4s\n"
- "sqadd v29.4s, v29.4s, v7.4s\n"
- "sqadd v30.4s, v30.4s, v8.4s\n"
- "sqadd v24.4s, v24.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v10.4s\n"
- "sqadd v26.4s, v26.4s, v4.4s\n"
- "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v3.4s\n"
+ "sqadd v26.4s, v26.4s, v2.4s\n"
+ "sqadd v27.4s, v27.4s, v1.4s\n"
"118:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v31.4s, v31.4s, v0.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1r { v6.4s }, [x23]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v5.4s }, [x23]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
"cmp x9, #0x10\n"
@@ -1917,163 +1916,163 @@ void a64_hybrid_u8qa_mmla_4x16 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
"uzp1 v31.8h, v31.8h, v20.8h\n"
"uzp1 v20.8h, v21.8h, v22.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v23.8h, v23.8h, v28.8h\n"
- "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
"uzp1 v31.16b, v31.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v23.16b, v23.16b, v28.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 127f\n"
"tbz x9, #3, 122f\n"
"str d31, [x27], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
"tbz x9, #2, 120f\n"
"st1 { v31.s }[2], [x27], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v23.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
"tbz x9, #1, 119f\n"
"st1 { v31.h }[6], [x27], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v23.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[14], [x27]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v23.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
"b 126f\n"
"119:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 126f\n"
"st1 { v31.b }[12], [x27]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v23.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
"b 126f\n"
"120:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 121f\n"
"st1 { v31.h }[4], [x27], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v23.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[10], [x27]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v23.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
"b 126f\n"
"121:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 126f\n"
"st1 { v31.b }[8], [x27]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v23.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
"b 126f\n"
"122:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 124f\n"
"str s31, [x27], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s23, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
"tbz x9, #1, 123f\n"
"st1 { v31.h }[2], [x27], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v23.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[6], [x27]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v23.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
"b 126f\n"
"123:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 126f\n"
"st1 { v31.b }[4], [x27]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v23.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
"b 126f\n"
"124:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 125f\n"
"str h31, [x27], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h23, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
"tbz x9, #0, 126f\n"
"st1 { v31.b }[2], [x27]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v23.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
"b 126f\n"
"125:" // Height 4: Partial direct writeback: partial_1_0
"str b31, [x27, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b23, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
"126:" // Height 4: Partial direct writeback: Done
"b 128f\n"
"127:" // Height 4: Full writeback
"str q31, [x27, #0x0]\n"
"add x27, x27, #0x10\n"
- "str q16, [x22, #0x0]\n"
- "str q23, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
"128:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 98b\n"
@@ -2089,7 +2088,6 @@ void a64_hybrid_u8qa_mmla_4x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"130:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index ce96c1b28f..38bb7c646d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -121,5 +121,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
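
The a55.cpp changes below follow the same pattern: the B-panel temporaries move from v6/v7 to v16/v17 (v20/v21 and v24/v25 at the taller heights) and the scratch address registers collapse onto x20/x21, so every hand-encoded ".inst" word changes with them, since the register numbers are baked into the UDOT encoding. The surrounding "ldr d / ldr x / mov v.d[1]" triples split each 128-bit load into two 64-bit halves, which the in-order Cortex-A55 issues more cheaply than a single 128-bit "ldr q". What each encoded word computes, as a minimal intrinsics sketch (assumes a compiler targeting armv8.2-a+dotprod; this is an illustration, not the library's own code path):

    #include <arm_neon.h>

    // Equivalent of ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]":
    // four 4-way u8*u8 dot products accumulated into the u32 lanes of acc.
    // The kernels emit the raw encodings rather than the mnemonic,
    // presumably so the file still assembles on toolchains without
    // DotProd mnemonic support.
    uint32x4_t udot_lane0(uint32x4_t acc, uint8x16_t weights, uint8x16_t act)
    {
        return vdotq_laneq_u32(acc, weights, act, 0); // indexed UDOT, lane 0
    }
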
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
index 705f6525b6..7f0fad7fa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -165,11 +164,11 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
"cbnz x15, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
@@ -186,129 +185,129 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x38]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x12, [x16, #0x48]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x78]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xb8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xf8]\n"
- "mov v7.d[1], x11\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "mov v16.d[1], x20\n"
"add x13, x13, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v0.d[1], x10\n"
- "mov v7.d[1], x11\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
"add x13, x13, #0x10\n"
"sub x14, x14, #0x10\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
"cbz x14, 24f\n"
"cmp x14, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s18, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
"cmp x14, #0x4\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
"add x16, x16, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
@@ -321,14 +320,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f80e208 // udot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x6f80e20a // udot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -499,226 +498,226 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
"cbnz x15, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
+ "add x12, x12, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
+ "add x12, x13, x21\n"
"50:" // Height 2: input setup done
"cmp x14, #0x10\n"
"blt 53f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x58]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x98]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0xd8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v6.d[1], x12\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
"add x13, x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x10, [x13, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x28, [x9, #0x8]\n"
- "mov v0.d[1], x10\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v1.d[1], x28\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x14, 58f\n"
"cmp x14, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
"cbz x14, 58f\n"
"tbz x14, #1, 56f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
"tbz x14, #0, 57f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x12]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -936,281 +935,281 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
"cbnz x15, 84f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
- "add x27, x27, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
"84:" // Height 3: input setup done
"cmp x14, #0x10\n"
"blt 87f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr d20, [x16, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x16, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x16, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
"add x13, x13, #0x10\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x16, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
"sub x14, x14, #0x10\n"
"ldr d7, [x16, #0x10]\n"
"cmp x14, #0x20\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v6.d[1], x12\n"
- "ldr x11, [x16, #0x18]\n"
- "mov v0.d[1], x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
"prfm pldl1keep, [x13, #0x80]\n"
- "mov v1.d[1], x28\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "mov v2.d[1], x26\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "mov v7.d[1], x11\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x50]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x16, #0xf0]\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
"cbz x14, 92f\n"
"cmp x14, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s24, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s23, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
"cbz x14, 92f\n"
"tbz x14, #1, 90f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
"tbz x14, #0, 91f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -1475,336 +1474,336 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
"cbnz x15, 118f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
- "add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"118:" // Height 4: input setup done
"cmp x14, #0x10\n"
"blt 121f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x16, #0x40]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
"cmp x14, #0x20\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
- "mov v7.d[1], x11\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x40]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x16, #0xf0]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
"cbz x14, 126f\n"
"cmp x14, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s28, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
"cbz x14, 126f\n"
"tbz x14, #1, 124f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
"tbz x14, #0, 125f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -2116,391 +2115,391 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
"cbnz x15, 152f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"152:" // Height 5: input setup done
"cmp x14, #0x10\n"
"blt 155f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x10, [x13, #0x8]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr x24, [x25, #0x8]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x22, [x23, #0x8]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x16, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
"sub x14, x14, #0x10\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
"cmp x14, #0x20\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
"prfm pldl1keep, [x13, #0x80]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
"prfm pldl1keep, [x9, #0x80]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x9, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
- "mov v3.d[1], x24\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
+ "mov v3.d[1], x23\n"
"mov v4.d[1], x22\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x16, #0xf0]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
"cbz x14, 160f\n"
"cmp x14, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s2, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
"cbz x14, 160f\n"
"tbz x14, #1, 158f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
"tbz x14, #0, 159f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -2862,98 +2861,98 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w14, [x20, x15, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x13, [x21, #0x0]\n"
- "ldr x9, [x21, #0x8]\n"
- "ldr x27, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x23, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
"cbnz x15, 186f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"add x9, x9, x20\n"
- "add x27, x27, x20\n"
- "add x25, x25, x20\n"
- "add x23, x23, x20\n"
- "add x21, x21, x20\n"
+ "add x28, x28, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
"mov x13, %x[input_ptr]\n"
- "add x9, x13, x20\n"
- "add x27, x9, x20\n"
- "add x25, x27, x20\n"
- "add x23, x25, x20\n"
- "add x21, x23, x20\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"186:" // Height 6: input setup done
"cmp x14, #0x10\n"
"blt 189f\n"
"ldr q0, [x13, #0x0]\n"
"cmp x14, #0x20\n"
- "ldr q1, [x9, #0x0]\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x21, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
"ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr x12, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x48]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x30]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x13, #0x8]\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x12, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr x24, [x25, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x22, [x23, #0x8]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr x20, [x21, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x50]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
"sub x14, x14, #0x10\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
@@ -2963,96 +2962,96 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0x88]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x70]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x12, [x16, #0xa8]\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x90]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xc8]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xb0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr x12, [x16, #0xe8]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xd0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x12\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x16, #0xf0]\n"
- "mov v7.d[1], x11\n"
+ "mov v7.d[1], x20\n"
"add x16, x16, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x12, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
@@ -3061,56 +3060,56 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x13, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x11, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d3, [x10, #0x0]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr d4, [x23, #0x0]\n"
+ "ldr d4, [x9, #0x0]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d5, [x21, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
"ldr d7, [x16, #0x10]\n"
- "mov v6.d[1], x12\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
- "mov v2.d[1], x26\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
"mov v3.d[1], x24\n"
- "mov v4.d[1], x22\n"
- "mov v5.d[1], x20\n"
- "mov v7.d[1], x11\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
"add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x9, x9, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x27, x27, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "add x25, x25, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "add x23, x23, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x21, x21, #0x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
"sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x16, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3210,98 +3209,98 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp x14, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x13], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
"sub x14, x14, #0x4\n"
- "ldr s1, [x9], #0x4\n"
+ "ldr s6, [x12], #0x4\n"
"cmp x14, #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x16, #0x0]\n"
+ ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x16, #0x20]\n"
+ ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x16, #0x30]\n"
+ ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
"cbz x14, 194f\n"
"tbz x14, #1, 192f\n"
"ldr h0, [x13], #0x2\n"
- "ldr h1, [x9], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h3, [x25], #0x2\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h5, [x21], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h5, [x28], #0x2\n"
"tbz x14, #0, 193f\n"
"ld1 { v0.b }[2], [x13]\n"
- "ld1 { v1.b }[2], [x9]\n"
- "ld1 { v2.b }[2], [x27]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
+ "ld1 { v5.b }[2], [x28]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x13, #0x0]\n"
- "ldr b1, [x9, #0x0]\n"
- "ldr b2, [x27, #0x0]\n"
- "ldr b3, [x25, #0x0]\n"
- "ldr b4, [x23, #0x0]\n"
- "ldr b5, [x21, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
+ "ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x16, #0x0]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
+ ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x16, #0x20]\n"
+ ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x16, #0x30]\n"
+ ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x15, x15, #0x1\n"
@@ -3488,7 +3487,6 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index 38131cfd4b..849c680843 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -165,11 +164,11 @@ void a64_hybrid_u8u32_dot_6x16 (
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -186,37 +185,37 @@ void a64_hybrid_u8u32_dot_6x16 (
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
"cmp x27, #0x20\n"
"add x10, x10, #0x100\n"
@@ -226,37 +225,37 @@ void a64_hybrid_u8u32_dot_6x16 (
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
@@ -264,17 +263,17 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
"sub x27, x27, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
"cmp x27, #0x4\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
"add x10, x10, #0x40\n"
"bge 20b\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
@@ -287,14 +286,14 @@ void a64_hybrid_u8u32_dot_6x16 (
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
"add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -465,12 +464,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 50f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -478,7 +477,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 50f\n"
"49:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"50:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 53f\n"
@@ -491,137 +490,137 @@ void a64_hybrid_u8u32_dot_6x16 (
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"sub x27, x27, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"cmp x27, #0x20\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x20]\n"
"add x26, x26, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q16, [x10, #0x30]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
"cbz x27, 58f\n"
"cmp x27, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
"cbz x27, 58f\n"
@@ -636,19 +635,19 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr b0, [x26, #0x0]\n"
"ldr b1, [x25, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -866,13 +865,13 @@ void a64_hybrid_u8u32_dot_6x16 (
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 84f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -881,8 +880,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 84f\n"
"83:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"84:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 87f\n"
@@ -899,75 +898,75 @@ void a64_hybrid_u8u32_dot_6x16 (
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x25, x25, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
"cmp x27, #0x20\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 85b\n"
@@ -977,98 +976,98 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x24, x24, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q20, [x10, #0x30]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 92f\n"
"cmp x27, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 92f\n"
@@ -1086,23 +1085,23 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr b1, [x25, #0x0]\n"
"ldr b2, [x24, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1367,14 +1366,14 @@ void a64_hybrid_u8u32_dot_6x16 (
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 118f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1384,9 +1383,9 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 118f\n"
"117:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"118:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 121f\n"
@@ -1405,7 +1404,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x25, x25, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1413,85 +1412,85 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x23, x23, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"cmp x27, #0x20\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 119b\n"
@@ -1502,7 +1501,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x25, x25, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x20]\n"
"add x24, x24, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1510,112 +1509,112 @@ void a64_hybrid_u8u32_dot_6x16 (
"sub x27, x27, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q24, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 126f\n"
"cmp x27, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 126f\n"
@@ -1636,27 +1635,27 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr b2, [x24, #0x0]\n"
"ldr b3, [x23, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1968,15 +1967,15 @@ void a64_hybrid_u8u32_dot_6x16 (
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 152f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1987,10 +1986,10 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 152f\n"
"151:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"152:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 155f\n"
@@ -2013,7 +2012,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x23, x23, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2022,100 +2021,100 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x27, #0x20\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
"ldr q0, [x26, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
"ldr q1, [x25, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
"ldr q2, [x24, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
"ldr q3, [x23, #0x0]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"ldr q4, [x22, #0x0]\n"
"ldr q7, [x10, #0x10]\n"
"bge 153b\n"
@@ -2129,7 +2128,7 @@ void a64_hybrid_u8u32_dot_6x16 (
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"add x22, x22, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2138,131 +2137,131 @@ void a64_hybrid_u8u32_dot_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q28, [x10, #0x30]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
"cbz x27, 160f\n"
"cmp x27, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
"ldr s1, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 160f\n"
@@ -2286,31 +2285,31 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr b3, [x23, #0x0]\n"
"ldr b4, [x22, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2672,16 +2671,16 @@ void a64_hybrid_u8u32_dot_6x16 (
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 186f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2693,11 +2692,11 @@ void a64_hybrid_u8u32_dot_6x16 (
"b 186f\n"
"185:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"186:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 189f\n"
@@ -2976,43 +2975,43 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp x27, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x25], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "ldr s2, [x24], #0x4\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 194f\n"
@@ -3039,35 +3038,35 @@ void a64_hybrid_u8u32_dot_6x16 (
"ldr b4, [x22, #0x0]\n"
"ldr b5, [x21, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3254,7 +3253,6 @@ void a64_hybrid_u8u32_dot_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index b5cedc7e98..e360452108 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -109,5 +109,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
index dd0c46e4dc..364f388e79 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -77,7 +77,6 @@ void a64_hybrid_u8u32_mmla_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 186f\n"
@@ -178,11 +177,11 @@ void a64_hybrid_u8u32_mmla_6x16 (
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -198,41 +197,41 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"cmp x27, #0x20\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
@@ -240,40 +239,40 @@ void a64_hybrid_u8u32_mmla_6x16 (
"prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- "trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e87a688 // ummla v8.4s, v20.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a68c // ummla v12.4s, v20.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a689 // ummla v9.4s, v20.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a68d // ummla v13.4s, v20.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a68a // ummla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x6e91a68e // ummla v14.4s, v20.16b, v17.16b\n"
+ "ldr q18, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e93a68b // ummla v11.4s, v20.16b, v19.16b\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6e92a68f // ummla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
+ ".inst 0x6e91a428 // ummla v8.4s, v1.16b, v17.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e93a42c // ummla v12.4s, v1.16b, v19.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
"20:" // Height 1: Multiply loop: Main loop skip
@@ -281,26 +280,26 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"bge 21b\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
@@ -325,23 +324,23 @@ void a64_hybrid_u8u32_mmla_6x16 (
"25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b1, [x26, #0x0]\n"
"26:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e97a668 // ummla v8.4s, v19.16b, v23.16b\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e92a66c // ummla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
+ ".inst 0x6e91a669 // ummla v9.4s, v19.16b, v17.16b\n"
+ "ldr q20, [x10, #0x40]\n"
+ ".inst 0x6e9fa66d // ummla v13.4s, v19.16b, v31.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e94a66a // ummla v10.4s, v19.16b, v20.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"27:" // Height 1: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -525,12 +524,12 @@ void a64_hybrid_u8u32_mmla_6x16 (
"52:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 53f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 54f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -538,7 +537,7 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 54f\n"
"53:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"54:" // Height 2: input setup done
"cmp x27, #0x10\n"
"blt 57f\n"
@@ -549,85 +548,85 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 56f\n"
"55:" // Height 2: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"sub x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"ldr q2, [x25, #0x0]\n"
"cmp x27, #0x20\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
"add x10, x10, #0x100\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"ldr q1, [x26, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
"bge 55b\n"
"56:" // Height 2: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
"sub x27, x27, #0x10\n"
"prfm pldl1keep, [x26, #0x80]\n"
"prfm pldl1keep, [x25, #0x80]\n"
@@ -637,27 +636,27 @@ void a64_hybrid_u8u32_mmla_6x16 (
"cmp x27, #0x8\n"
"blt 59f\n"
"58:" // Height 2: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
"sub x27, x27, #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- "ldr q6, [x10, #0x60]\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x6e91a668 // ummla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x6e96a66c // ummla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e81a669 // ummla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
"cmp x27, #0x8\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"bge 58b\n"
"59:" // Height 2: Multiply loop: Skip odd blocks
@@ -689,23 +688,23 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr b1, [x26, #0x0]\n"
"ldr b2, [x25, #0x0]\n"
"63:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n"
+ "ldr q21, [x10, #0x30]\n"
+ ".inst 0x6e85a669 // ummla v9.4s, v19.16b, v5.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e95a66d // ummla v13.4s, v19.16b, v21.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
"add x10, x10, #0x80\n"
"64:" // Height 2: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -953,13 +952,13 @@ void a64_hybrid_u8u32_mmla_6x16 (
"89:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 90f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 91f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -968,8 +967,8 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 91f\n"
"90:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"91:" // Height 3: input setup done
"cmp x27, #0x10\n"
"blt 94f\n"
@@ -981,167 +980,167 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 93f\n"
"92:" // Height 3: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 92b\n"
"93:" // Height 3: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- "trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
"94:" // Height 3: Multiply loop: Main loop skip
"cbz x27, 101f\n"
"cmp x27, #0x8\n"
"blt 96f\n"
"95:" // Height 3: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"bge 95b\n"
"96:" // Height 3: Multiply loop: Skip odd blocks
"cbz x27, 101f\n"
@@ -1179,33 +1178,33 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr b2, [x25, #0x0]\n"
"ldr b3, [x24, #0x0]\n"
"100:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e9da78c // ummla v12.4s, v28.16b, v29.16b\n"
+ ".inst 0x6e9da774 // ummla v20.4s, v27.16b, v29.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"101:" // Height 3: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1499,14 +1498,14 @@ void a64_hybrid_u8u32_mmla_6x16 (
"126:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 127f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 128f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1516,9 +1515,9 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 128f\n"
"127:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"128:" // Height 4: input setup done
"cmp x27, #0x10\n"
"blt 131f\n"
@@ -1531,173 +1530,173 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q6, [x10, #0x10]\n"
"blt 130f\n"
"129:" // Height 4: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
"add x23, x23, #0x10\n"
"ldr q4, [x23, #0x0]\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
"cmp x27, #0x20\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
"ldr q3, [x24, #0x0]\n"
"ldr q6, [x10, #0x10]\n"
"bge 129b\n"
"130:" // Height 4: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
"131:" // Height 4: Multiply loop: Main loop skip
"cbz x27, 138f\n"
"cmp x27, #0x8\n"
"blt 133f\n"
"132:" // Height 4: Multiply loop: Odd block loop
- "ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
"cmp x27, #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"bge 132b\n"
"133:" // Height 4: Multiply loop: Skip odd blocks
"cbz x27, 138f\n"
@@ -1742,33 +1741,33 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr b3, [x24, #0x0]\n"
"ldr b4, [x23, #0x0]\n"
"137:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "ldr q6, [x10, #0x10]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
"138:" // Height 4: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2125,15 +2124,15 @@ void a64_hybrid_u8u32_mmla_6x16 (
"163:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 164f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 165f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2144,10 +2143,10 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 165f\n"
"164:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"165:" // Height 5: input setup done
"cmp x27, #0x10\n"
"blt 168f\n"
@@ -2160,174 +2159,174 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q7, [x10, #0x0]\n"
"blt 167f\n"
"166:" // Height 5: Multiply loop: Main loop head
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
"sub x27, x27, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
"add x26, x26, #0x10\n"
- ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
"add x25, x25, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
"cmp x27, #0x20\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
"prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n"
"ldr q2, [x25, #0x0]\n"
- ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"bge 166b\n"
"167:" // Height 5: Multiply loop: Single iteration only
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
"trn2 v1.2d, v1.2d, v2.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
"trn1 v2.2d, v3.2d, v4.2d\n"
"trn2 v3.2d, v3.2d, v4.2d\n"
".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
"add x26, x26, #0x10\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "trn2 v5.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
"add x25, x25, #0x10\n"
- ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
"add x24, x24, #0x10\n"
".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x40]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
"add x22, x22, #0x10\n"
"sub x27, x27, #0x10\n"
- ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x60]\n"
"prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
"prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
"prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
"ldr q7, [x10, #0x80]\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n"
".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
@@ -2337,48 +2336,48 @@ void a64_hybrid_u8u32_mmla_6x16 (
"blt 170f\n"
"169:" // Height 5: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr q6, [x10, #0x0]\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
"cmp x27, #0x8\n"
- ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n"
"bge 169b\n"
"170:" // Height 5: Multiply loop: Skip odd blocks
"cbz x27, 175f\n"
@@ -2430,42 +2429,42 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr b4, [x23, #0x0]\n"
"ldr b5, [x22, #0x0]\n"
"174:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e86a4e8 // ummla v8.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a470 // ummla v16.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n"
"175:" // Height 5: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2872,16 +2871,16 @@ void a64_hybrid_u8u32_mmla_6x16 (
"200:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 201f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 202f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2893,11 +2892,11 @@ void a64_hybrid_u8u32_mmla_6x16 (
"b 202f\n"
"201:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"202:" // Height 6: input setup done
"cmp x27, #0x10\n"
"blt 205f\n"
@@ -2964,42 +2963,42 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr q2, [x25, #0x0]\n"
"prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q0, [x10, #0x90]\n"
"ldr q4, [x23, #0x0]\n"
".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xf0]\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n"
"ldr q7, [x10, #0x0]\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n"
"ldr q1, [x26, #0x0]\n"
- ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
"ldr q3, [x24, #0x0]\n"
- ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n"
"ldr q5, [x22, #0x0]\n"
"ldr q6, [x21, #0x0]\n"
"bge 203b\n"
@@ -3055,35 +3054,35 @@ void a64_hybrid_u8u32_mmla_6x16 (
".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q2, [x10, #0x90]\n"
".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xb0]\n"
- ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xc0]\n"
- ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
- "ldr q6, [x10, #0xd0]\n"
- ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
- "ldr q7, [x10, #0xe0]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
- ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n"
"ldr q6, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
- ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n"
".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
@@ -3093,49 +3092,49 @@ void a64_hybrid_u8u32_mmla_6x16 (
"blt 207f\n"
"206:" // Height 6: Multiply loop: Odd block loop
"ldr d1, [x26], #0x8\n"
- "ldr d2, [x25], #0x8\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
"sub x27, x27, #0x8\n"
- "ldr d3, [x24], #0x8\n"
- "ldr d4, [x23], #0x8\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
"cmp x27, #0x8\n"
- "ldr d5, [x22], #0x8\n"
- "ldr d7, [x21], #0x8\n"
- "trn1 v4.2d, v5.2d, v7.2d\n"
- "ldr q6, [x10, #0x0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x60]\n"
- ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x70]\n"
+ ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n"
- ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n"
+ ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n"
"bge 206b\n"
"207:" // Height 6: Multiply loop: Skip odd blocks
"cbz x27, 212f\n"
@@ -3194,42 +3193,42 @@ void a64_hybrid_u8u32_mmla_6x16 (
"ldr b5, [x22, #0x0]\n"
"ldr b6, [x21, #0x0]\n"
"211:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q7, [x10, #0x0]\n"
- "trn1 v0.2d, v1.2d, v2.2d\n"
- "trn1 v2.2d, v3.2d, v4.2d\n"
- ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
- "trn1 v4.2d, v5.2d, v6.2d\n"
- "ldr q6, [x10, #0x10]\n"
- ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x20]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x40]\n"
- ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
- "ldr q6, [x10, #0x50]\n"
- ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
- "ldr q7, [x10, #0x60]\n"
- ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e80a4e8 // ummla v8.4s, v7.16b, v0.16b\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e80a470 // ummla v16.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a458 // ummla v24.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n"
"ldr q6, [x10, #0x70]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
"add x10, x10, #0x80\n"
- ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n"
"212:" // Height 6: Multiply loop: No odd multiplies
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3440,7 +3439,6 @@ void a64_hybrid_u8u32_mmla_6x16 (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"224:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 153a4cc167..25c5bf1b44 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return 12;
}
- static unsigned int stripe_width()
- {
- return 4;
- }
-
static constexpr unsigned int k_unroll()
{
return 2;
@@ -97,5 +92,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index b3bde74635..5684f464b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_bf16fp32_dot_8x12(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_dot_8x12(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -88,8 +91,8 @@ void a64_interleaved_bf16fp32_dot_8x12(
"movi v31.16b, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
@@ -123,35 +126,35 @@ void a64_interleaved_bf16fp32_dot_8x12(
".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x22, #0x50]\n"
+ "ldr q2, [x22, #0x50]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
"add x22, x22, #0x60\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- ".inst 0x4f62f08b // bfdot v11.4s, v4.8h, v2.h[1]\n"
- ".inst 0x4f42f88e // bfdot v14.4s, v4.8h, v2.h[2]\n"
- ".inst 0x4f62f891 // bfdot v17.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f097 // bfdot v23.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f89a // bfdot v26.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f89d // bfdot v29.4s, v4.8h, v3.h[3]\n"
+ ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n"
+ ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n"
+ ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n"
+ ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n"
+ ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n"
+ ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n"
+ ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n"
+ ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n"
"ldr q4, [x22, #0x0]\n"
- ".inst 0x4f42f0a9 // bfdot v9.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ac // bfdot v12.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8af // bfdot v15.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8b2 // bfdot v18.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b5 // bfdot v21.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b8 // bfdot v24.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8bb // bfdot v27.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8be // bfdot v30.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n"
+ ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n"
+ ".inst 0x4f43f8af // bfdot v15.4s, v5.8h, v3.h[2]\n"
+ ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n"
+ ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n"
+ ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n"
+ ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n"
"ldr q5, [x22, #0x10]\n"
- ".inst 0x4f42f0ca // bfdot v10.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0cd // bfdot v13.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0d9 // bfdot v25.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8dc // bfdot v28.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n"
+ ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n"
+ ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n"
+ ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n"
"ldr q6, [x22, #0x20]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -182,37 +185,37 @@ void a64_interleaved_bf16fp32_dot_8x12(
".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
"cbz x20, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x22, #0x0]\n"
- "ldr q4, [x22, #0x10]\n"
- ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n"
- "ldr q5, [x22, #0x20]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f40f8ee // bfdot v14.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f60f8f1 // bfdot v17.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f41f0f4 // bfdot v20.4s, v7.8h, v1.h[0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q1, [x22, #0x10]\n"
+ ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x22, #0x20]\n"
+ ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
+ ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
+ ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
+ ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n"
"add x22, x22, #0x30\n"
- ".inst 0x4f61f0f7 // bfdot v23.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f41f8fa // bfdot v26.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f61f8fd // bfdot v29.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f40f089 // bfdot v9.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f08c // bfdot v12.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f88f // bfdot v15.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f892 // bfdot v18.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f095 // bfdot v21.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f098 // bfdot v24.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89b // bfdot v27.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89e // bfdot v30.4s, v4.8h, v1.h[3]\n"
- ".inst 0x4f40f0aa // bfdot v10.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8b0 // bfdot v16.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8b3 // bfdot v19.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b6 // bfdot v22.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8bc // bfdot v28.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bf // bfdot v31.4s, v5.8h, v1.h[3]\n"
+ ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n"
+ ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n"
+ ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n"
+ ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n"
+ ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n"
+ ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n"
+ ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n"
+ ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n"
+ ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n"
+ ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n"
+ ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n"
+ ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
"str q8, [%x[Cpanel], #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 17c93faca2..66c2b92a34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -57,11 +57,6 @@ public:
return 12;
}
- static unsigned int stripe_width()
- {
- return 4;
- }
-
static constexpr unsigned int k_unroll()
{
return 4;
@@ -117,5 +112,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
index cba29bc572..bab687a9b4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_bf16fp32_mmla_8x12_a510(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
"movi v31.16b, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- "ldp q6, q7, [x22], #0x20\n"
+ "ldp q3, q7, [x22], #0x20\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"sub x20, x20, #0x2\n"
".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
"cmp x20, #0x2\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e43ec2f // bfmmla v15.4s, v1.8h, v3.8h\n"
".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e43ecdb // bfmmla v27.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e47ecde // bfmmla v30.4s, v6.8h, v7.8h\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
"ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
"ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
"ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
"ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
"ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
"bge 3b\n"
"4:" // main loop skip
"ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "ldp q4, q5, [x22], #0x20\n"
+ "ldp q5, q4, [x22], #0x20\n"
".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
@@ -167,93 +170,93 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510(
".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n"
"cbz x20, 5f\n"
- "ldp q6, q7, [x22], #0x20\n"
- "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
- "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
- "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldp q6, q7, [x22], #0x20\n"
- ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
- ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ld1 { v5.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 2938639048..8485820c7c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_bf16fp32_mmla_8x12(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_mmla_8x12(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@ void a64_interleaved_bf16fp32_mmla_8x12(
"movi v31.16b, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ldr q3, [%x[Apanel], #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [x22, #0x10]\n"
+ "ldr q3, [x22, #0x10]\n"
".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"sub x20, x20, #0x2\n"
".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
"ldr q4, [x22, #0x20]\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
"ldr q5, [x22, #0x30]\n"
- ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
"cmp x20, #0x2\n"
- ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
- "ldr q7, [x22, #0x50]\n"
+ ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x22, #0x50]\n"
".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
"ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@ void a64_interleaved_bf16fp32_mmla_8x12(
".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
"ldr q2, [%x[Apanel], #0x30]\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
"ldr q4, [x22, #0x60]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
"ldr q5, [x22, #0x70]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [x22, #0x80]\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [x22, #0x90]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x22, #0x90]\n"
".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
"ldr q4, [x22, #0xa0]\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
"ldr q5, [x22, #0xb0]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
"ldr q0, [%x[Apanel], #0x50]\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
"ldr q1, [%x[Apanel], #0x60]\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
"ldr q2, [%x[Apanel], #0x70]\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"add x22, x22, #0xc0\n"
"bge 3b\n"
@@ -191,89 +194,89 @@ void a64_interleaved_bf16fp32_mmla_8x12(
".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
"cbz x20, 5f\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q7, [x22, #0x10]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "ldr q4, [x22, #0x20]\n"
- "ldr q5, [x22, #0x30]\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [x22, #0x50]\n"
- ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
- ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
- ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
"add x22, x22, #0x60\n"
- ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
- ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 4cc3ed040a..37a54fcfab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return 12;
}
- static unsigned int stripe_width()
- {
- return 4;
- }
-
static constexpr unsigned int k_unroll()
{
return 8;
@@ -111,11 +106,9 @@ public:
break;
}
}
-
};
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
index e46cb8a67a..c1d37383df 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_s8s32_mmla_8x12_a510(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
"movi v31.4s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ "ldp q3, q7, [x22], #0x20\n"
".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
"sub x20, x20, #0x2\n"
".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n"
"cmp x20, #0x2\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e83a409 // smmla v9.4s, v0.16b, v3.16b\n"
".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e83a42f // smmla v15.4s, v1.16b, v3.16b\n"
".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e83a455 // smmla v21.4s, v2.16b, v3.16b\n"
".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x4e83a4db // smmla v27.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e87a4de // smmla v30.4s, v6.16b, v7.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n"
+ ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n"
"bge 3b\n"
"4:" // main loop skip
"ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "ldp q4, q5, [x22], #0x20\n"
+ "ldp q5, q4, [x22], #0x20\n"
".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
@@ -167,93 +170,93 @@ void a64_interleaved_s8s32_mmla_8x12_a510(
".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e85a40a // smmla v10.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a40d // smmla v13.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a430 // smmla v16.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a433 // smmla v19.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a456 // smmla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a459 // smmla v25.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a47c // smmla v28.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
"cbz x20, 5f\n"
- "ldp q6, q7, [x22], #0x20\n"
- "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldp q4, q5, [x22], #0x20\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
- ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
+ "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n"
+ ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e81a4d0 // smmla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index fc20c2fc9d..a097dc358a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_s8s32_mmla_8x12(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_s8s32_mmla_8x12(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@ void a64_interleaved_s8s32_mmla_8x12(
"movi v31.4s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ldr q3, [%x[Apanel], #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [x22, #0x10]\n"
+ "ldr q3, [x22, #0x10]\n"
".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
"sub x20, x20, #0x2\n"
".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0x20]\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n"
"ldr q5, [x22, #0x30]\n"
- ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40c // smmla v12.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
"cmp x20, #0x2\n"
- ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x50]\n"
+ ".inst 0x4e83a432 // smmla v18.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4db // smmla v27.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x4e83a4de // smmla v30.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x50]\n"
".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
"ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@ void a64_interleaved_s8s32_mmla_8x12(
".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
"ldr q2, [%x[Apanel], #0x30]\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0x60]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
"ldr q5, [x22, #0x70]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x80]\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x90]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x90]\n"
".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0xa0]\n"
- ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n"
"ldr q5, [x22, #0xb0]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n"
"ldr q0, [%x[Apanel], #0x50]\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n"
"ldr q1, [%x[Apanel], #0x60]\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n"
"ldr q2, [%x[Apanel], #0x70]\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"add x22, x22, #0xc0\n"
"bge 3b\n"
@@ -191,89 +194,89 @@ void a64_interleaved_s8s32_mmla_8x12(
".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
"cbz x20, 5f\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q7, [x22, #0x10]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "ldr q4, [x22, #0x20]\n"
- "ldr q5, [x22, #0x30]\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x50]\n"
- ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
- ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
- ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n"
"add x22, x22, #0x60\n"
- ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e81a4d0 // smmla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index fa93c1d90d..0088557b8d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
+
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return 12;
}
- static unsigned int stripe_width()
- {
- return 4;
- }
-
static constexpr unsigned int k_unroll()
{
return 8;
@@ -116,5 +111,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
index 83301d80bb..54c51954c8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_u8u32_mmla_8x12_a510(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
"movi v31.4s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ "ldp q3, q7, [x22], #0x20\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
"sub x20, x20, #0x2\n"
".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n"
"cmp x20, #0x2\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e83a409 // ummla v9.4s, v0.16b, v3.16b\n"
".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e83a42f // ummla v15.4s, v1.16b, v3.16b\n"
".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e83a455 // ummla v21.4s, v2.16b, v3.16b\n"
".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e83a4db // ummla v27.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e87a4de // ummla v30.4s, v6.16b, v7.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n"
+ ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n"
"ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n"
"ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n"
"ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n"
"ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n"
"bge 3b\n"
"4:" // main loop skip
"ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "ldp q4, q5, [x22], #0x20\n"
+ "ldp q5, q4, [x22], #0x20\n"
".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
@@ -167,93 +170,93 @@ void a64_interleaved_u8u32_mmla_8x12_a510(
".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e85a40a // ummla v10.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a40d // ummla v13.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a430 // ummla v16.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a433 // ummla v19.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a456 // ummla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a459 // ummla v25.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a47c // ummla v28.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
"cbz x20, 5f\n"
- "ldp q6, q7, [x22], #0x20\n"
- "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
- "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
- "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldp q4, q5, [x22], #0x20\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [x22], #0x20\n"
- ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
+ "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n"
+ ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index c5342197c1..30260b9c29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void a64_interleaved_u8u32_mmla_8x12(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -43,7 +47,6 @@ void a64_interleaved_u8u32_mmla_8x12(
ka.bblocks = bblocks;
__asm__ __volatile__(
-
"1:" // Height loop
"ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@ void a64_interleaved_u8u32_mmla_8x12(
"movi v31.4s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ldr q3, [%x[Apanel], #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [x22, #0x10]\n"
+ "ldr q3, [x22, #0x10]\n"
".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
"sub x20, x20, #0x2\n"
".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0x20]\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n"
"ldr q5, [x22, #0x30]\n"
- ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40c // ummla v12.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
"cmp x20, #0x2\n"
- ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x50]\n"
+ ".inst 0x6e83a432 // ummla v18.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4db // ummla v27.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x6e83a4de // ummla v30.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x50]\n"
".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
"ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@ void a64_interleaved_u8u32_mmla_8x12(
".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
"ldr q2, [%x[Apanel], #0x30]\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0x60]\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
"ldr q5, [x22, #0x70]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x80]\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x90]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x90]\n"
".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n"
"ldr q4, [x22, #0xa0]\n"
- ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n"
"ldr q5, [x22, #0xb0]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n"
"ldr q0, [%x[Apanel], #0x50]\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n"
"ldr q1, [%x[Apanel], #0x60]\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n"
"ldr q2, [%x[Apanel], #0x70]\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"add x22, x22, #0xc0\n"
"bge 3b\n"
@@ -191,89 +194,89 @@ void a64_interleaved_u8u32_mmla_8x12(
".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
"cbz x20, 5f\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q7, [x22, #0x10]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "ldr q4, [x22, #0x20]\n"
- "ldr q5, [x22, #0x30]\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [x22, #0x40]\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [x22, #0x50]\n"
- ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n"
"add x22, x22, #0x60\n"
- ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
- ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
"5:" // multiply loop done
"subs x23, x23, #0x1\n"
- "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
"uzp2 v8.2d, v8.2d, v11.2d\n"
- "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
"uzp2 v9.2d, v9.2d, v12.2d\n"
- "str q4, [%x[Cpanel], #0x0]\n"
- "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
"uzp2 v10.2d, v10.2d, v13.2d\n"
- "str q11, [%x[Cpanel], #0x10]\n"
- "str q12, [%x[Cpanel], #0x20]\n"
- "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
"uzp2 v14.2d, v14.2d, v17.2d\n"
"str q8, [%x[Cpanel], #0x30]\n"
- "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
"uzp2 v15.2d, v15.2d, v18.2d\n"
"str q9, [%x[Cpanel], #0x40]\n"
- "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
"uzp2 v16.2d, v16.2d, v19.2d\n"
"str q10, [%x[Cpanel], #0x50]\n"
- "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
"uzp2 v20.2d, v20.2d, v23.2d\n"
- "str q13, [%x[Cpanel], #0x60]\n"
- "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
"uzp2 v21.2d, v21.2d, v24.2d\n"
- "str q17, [%x[Cpanel], #0x70]\n"
- "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
"uzp2 v22.2d, v22.2d, v25.2d\n"
- "str q18, [%x[Cpanel], #0x80]\n"
- "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
"uzp2 v26.2d, v26.2d, v29.2d\n"
"str q14, [%x[Cpanel], #0x90]\n"
- "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
"uzp2 v27.2d, v27.2d, v30.2d\n"
"str q15, [%x[Cpanel], #0xa0]\n"
- "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
"uzp2 v28.2d, v28.2d, v31.2d\n"
"str q16, [%x[Cpanel], #0xb0]\n"
- "str q19, [%x[Cpanel], #0xc0]\n"
- "str q23, [%x[Cpanel], #0xd0]\n"
- "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
"str q20, [%x[Cpanel], #0xf0]\n"
"str q21, [%x[Cpanel], #0x100]\n"
"str q22, [%x[Cpanel], #0x110]\n"
- "str q25, [%x[Cpanel], #0x120]\n"
- "str q29, [%x[Cpanel], #0x130]\n"
- "str q30, [%x[Cpanel], #0x140]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
"str q26, [%x[Cpanel], #0x150]\n"
"str q27, [%x[Cpanel], #0x160]\n"
"str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
index f86bcebe64..76f43f0933 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "../std_transforms_sme.hpp"
#include "../bfloat.hpp"
@@ -84,4 +83,4 @@ public:
#undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
index 520eeedfec..db29e42ef1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -62,7 +62,7 @@ void sme2_gemv_bf16fp32_dot_16VL (
break;
}
__asm__ __volatile__(
- "ptrue p1.b\n"
+ "ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
"cntw x28, ALL, MUL #4\n"
"add x27, %x[N], x28\n"
@@ -102,311 +102,311 @@ void sme2_gemv_bf16fp32_dot_16VL (
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x1\n"
+ "lsl x21, %x[K], #0x1\n"
"mov x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"6:" // Width 1: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "sub x21, x21, #0x8\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[0]\n"
"addvl x26, x26, #16\n"
- "cmp x21, #0x8\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b498 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[1]\n"
"addvl x26, x26, #16\n"
"add x23, x23, #0x10\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z8.h[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z8.h[3]\n"
"addvl x26, x26, #16\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- "addvl x26, x26, #16\n"
- "ble 9f\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
"addvl x26, x26, #16\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
"b 36f\n"
"12:" // Width 2
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x1\n"
+ "lsl x21, %x[K], #0x1\n"
"sub x20, %x[N], x28\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"14:" // Width 2: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "sub x21, x21, #0x8\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- "cmp x21, #0x8\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z9.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[0]\n"
+ "cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xc159b099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[0]\n"
"addvl x26, x26, #16\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xc159b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z9.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z9.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z9.h[3]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z9.h[3]\n"
"addvl x26, x26, #16\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xc15bb198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb419 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[1]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
"addvl x26, x26, #16\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
"20:" // Width 3
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x1\n"
+ "lsl x21, %x[K], #0x1\n"
"msub x20, x28, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"22:" // Width 3: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "sub x21, x21, #0x8\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z15.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- "cmp x21, #0x8\n"
+ ".inst 0xc15fb018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[0]\n"
+ "cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
- ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb51a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[2]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbd1a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
- "addvl x26, x26, #16\n"
- "ble 25f\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xc15bb29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z11.h[0]\n"
"addvl x26, x26, #16\n"
"ble 25f\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z11.h[1]\n"
".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xc15bb79a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[1]\n"
"addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
+ ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb99a // bfdot za.s[x9, 2], { z12.h-z15.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
"addvl x26, x26, #16\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
"b 36f\n"
"28:" // Width 4
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x1\n"
+ "lsl x21, %x[K], #0x1\n"
"msub x20, x28, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
"addvl x24, x24, #16\n"
@@ -414,126 +414,126 @@ void sme2_gemv_bf16fp32_dot_16VL (
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"30:" // Width 4: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "sub x21, x21, #0x8\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- "cmp x21, #0x8\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z8.h[0]\n"
+ "cmp x22, #0x8\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
- ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
- ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
- ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z8.h[0]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[0]\n"
+ ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b19b // bfdot za.s[x9, 3], { z12.h-z15.h }, z8.h[0]\n"
"addvl x26, x26, #16\n"
".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
- ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xc158b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z8.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z8.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[1]\n"
".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ ".inst 0xc158b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[2]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
- ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc158ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[2]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b81b // bfdot za.s[x9, 3], { z0.h-z3.h }, z8.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158be98 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158be19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158bc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z8.h[3]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158be9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[3]\n"
"addvl x26, x26, #16\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
- "whilelt p0.h, XZR, x21\n"
- "ld1rqh { z10.h }, p0/Z, [x23]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
- ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xc15bb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb299 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[0]\n"
+ ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb39a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[0]\n"
".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
+ ".inst 0xc15bb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[0]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[1]\n"
".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
- ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xc15bb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[1]\n"
".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xc15bb61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[1]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
- ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
- ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ ".inst 0xc15bba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bba1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[2]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xc15bbf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xc15bbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[3]\n"
"addvl x26, x26, #16\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
- ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
"subs x27, x27, #0x4\n"
@@ -541,7 +541,7 @@ void sme2_gemv_bf16fp32_dot_16VL (
"bgt 4b\n"
"36:" // Exit
".inst 0xd503467f // SMSTOP\n"
- "ptrue p1.b\n"
+ "ptrue p8.b\n"
: [N] "+&r" (N)
: [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -550,5 +550,4 @@ void sme2_gemv_bf16fp32_dot_16VL (
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
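For reference, the bfdot operations in the kernel above are SME2 multi-vector BFDOT: each 32-bit ZA accumulator lane receives the dot product of a pair of adjacent bf16 elements from the left-hand vectors against an indexed bf16 pair broadcast from the right-hand operand. A minimal scalar model of one lane, assuming standard bf16 semantics and ignoring the architecture's intermediate rounding details (names illustrative, not from this patch):

#include <cstdint>
#include <cstring>

// bf16 is the high half of an IEEE-754 binary32, so widening to float is a
// 16-bit shift into the top half of the bit pattern.
static float bf16_to_f32(uint16_t h)
{
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// One accumulator lane of BFDOT: acc += a0*b0 + a1*b1 over a pair of
// adjacent bf16 elements from each operand.
static float bfdot_lane_ref(float acc, const uint16_t a[2], const uint16_t b[2])
{
    return acc + bf16_to_f32(a[0]) * bf16_to_f32(b[0])
               + bf16_to_f32(a[1]) * bf16_to_f32(b[1]);
}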
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
index f33cb9a33d..7d98d5cb98 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "../std_transforms_sme.hpp"
#define ARGLIST \
@@ -83,4 +82,4 @@ public:
#undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
index 9224868e6a..d2c260536d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -61,7 +61,7 @@ void sme2_gemv_fp32_mla_16VL (
break;
}
__asm__ __volatile__(
- "ptrue p1.b\n"
+ "ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
"cntw x28, ALL, MUL #4\n"
"add x27, %x[N], x28\n"
@@ -101,311 +101,311 @@ void sme2_gemv_fp32_mla_16VL (
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"mov x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"6:" // Width 1: setup done
- "cmp x21, #0x4\n"
+ "cmp x22, #0x4\n"
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "sub x21, x21, #0x4\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a280 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[0]\n"
"addvl x26, x26, #16\n"
- "cmp x21, #0x4\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ "cmp x22, #0x4\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a480 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[1]\n"
"addvl x26, x26, #16\n"
"add x23, x23, #0x10\n"
- ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z8.s[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158af00 // fmla za.s[x9, 0], { z24.s-z27.s }, z8.s[3]\n"
"addvl x26, x26, #16\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- "addvl x26, x26, #16\n"
- "ble 9f\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[3]\n"
"addvl x26, x26, #16\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
"b 36f\n"
"12:" // Width 2
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"sub x20, %x[N], x28\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"14:" // Width 2: setup done
- "cmp x21, #0x4\n"
+ "cmp x22, #0x4\n"
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "sub x21, x21, #0x4\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- "cmp x21, #0x4\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z9.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[0]\n"
+ "cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xc159a081 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[0]\n"
"addvl x26, x26, #16\n"
".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xc159a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a481 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z9.s[2]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a801 // fmla za.s[x9, 1], { z0.s-z3.s }, z9.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z9.s[3]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z9.s[3]\n"
"addvl x26, x26, #16\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xc15ba180 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba700 // fmla za.s[x9, 0], { z24.s-z27.s }, z11.s[1]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba401 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[1]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba980 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[2]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bab81 // fmla za.s[x9, 1], { z28.s-z31.s }, z11.s[2]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
"addvl x26, x26, #16\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
"20:" // Width 3
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"msub x20, x28, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"22:" // Width 3: setup done
- "cmp x21, #0x4\n"
+ "cmp x22, #0x4\n"
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "sub x21, x21, #0x4\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z15.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- "cmp x21, #0x4\n"
+ ".inst 0xc15fa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z15.s[0]\n"
+ "cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
- ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xc15fa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z15.s[0]\n"
+ ".inst 0xa042c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa002 // fmla za.s[x9, 2], { z0.s-z3.s }, z15.s[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
- ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fa680 // fmla za.s[x9, 0], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa681 // fmla za.s[x9, 1], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa502 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
- ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z15.s[2]\n"
+ ".inst 0xa041c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa901 // fmla za.s[x9, 1], { z8.s-z11.s }, z15.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15faa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z15.s[2]\n"
"addvl x26, x26, #16\n"
".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
- ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc15fae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fad02 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[3]\n"
"addvl x26, x26, #16\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
- "addvl x26, x26, #16\n"
- "ble 25f\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
- ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xc15ba282 // fmla za.s[x9, 2], { z20.s-z23.s }, z11.s[0]\n"
"addvl x26, x26, #16\n"
"ble 25f\n"
".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba481 // fmla za.s[x9, 1], { z4.s-z7.s }, z11.s[1]\n"
".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xc15ba782 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[1]\n"
"addvl x26, x26, #16\n"
"ble 25f\n"
- ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba880 // fmla za.s[x9, 0], { z4.s-z7.s }, z11.s[2]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baa81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[2]\n"
+ ".inst 0xa042c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba982 // fmla za.s[x9, 2], { z12.s-z15.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
"addvl x26, x26, #16\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
"b 36f\n"
"28:" // Width 4
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"msub x20, x28, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
- ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
"addvl x24, x24, #16\n"
@@ -413,126 +413,126 @@ void sme2_gemv_fp32_mla_16VL (
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"30:" // Width 4: setup done
- "cmp x21, #0x4\n"
+ "cmp x22, #0x4\n"
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "sub x21, x21, #0x4\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- "cmp x21, #0x4\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z8.s[0]\n"
+ "cmp x22, #0x4\n"
"add x23, x23, #0x10\n"
- ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
- ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
- ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
- ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
- ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa041c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z8.s[0]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a202 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[0]\n"
+ ".inst 0xa043c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a183 // fmla za.s[x9, 3], { z12.s-z15.s }, z8.s[0]\n"
"addvl x26, x26, #16\n"
".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
- ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
- ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xc158a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z8.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a681 // fmla za.s[x9, 1], { z20.s-z23.s }, z8.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a602 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[1]\n"
".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ ".inst 0xc158a683 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a880 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[2]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158aa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[2]\n"
".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
- ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc158aa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[2]\n"
+ ".inst 0xa043c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a803 // fmla za.s[x9, 3], { z0.s-z3.s }, z8.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ae80 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[3]\n"
+ ".inst 0xa042c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158ac82 // fmla za.s[x9, 2], { z4.s-z7.s }, z8.s[3]\n"
+ ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158ae83 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[3]\n"
"addvl x26, x26, #16\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
- "whilelt p0.s, XZR, x21\n"
- "ld1rqw { z10.s }, p0/Z, [x23]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
- ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
- ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xc15ba200 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[0]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba281 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[0]\n"
+ ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba382 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[0]\n"
".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
+ ".inst 0xc15ba203 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[0]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba400 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[1]\n"
".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
- ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xc15ba601 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba602 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[1]\n"
".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xc15ba603 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[1]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
- ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
- ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ ".inst 0xc15baa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15baa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15baa03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[2]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
- ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xc15baf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z11.s[3]\n"
".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xc15bae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[3]\n"
"addvl x26, x26, #16\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z0.s }, p1/Z, [x21]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- "ld1rw { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
- ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
- ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
- ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
"subs x27, x27, #0x4\n"
@@ -540,7 +540,7 @@ void sme2_gemv_fp32_mla_16VL (
"bgt 4b\n"
"36:" // Exit
".inst 0xd503467f // SMSTOP\n"
- "ptrue p1.b\n"
+ "ptrue p8.b\n"
: [N] "+&r" (N)
: [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -549,5 +549,4 @@ void sme2_gemv_fp32_mla_16VL (
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
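
For reference, the fp32 GEMV above computes y = clamp(bias + x*B) over one to four groups of four vector-length tiles of output columns: ZA is seeded from the bias (or zeroed on the "no bias" paths), fmla accumulates over K, and the fclamp pair applies the minval/maxval activation when bit 1 of the flags is set. A minimal scalar sketch of that arithmetic, assuming a plain K x N weight layout rather than the kernel's packed layout (all names here are illustrative, not the library's API):

#include <cstddef>

// Hedged reference sketch of the kernel's arithmetic; the assembly
// above tiles the j loop across ZA and vectorises the k loop.
static void gemv_ref(const float *A,    // 1 x K input row
                     const float *B,    // K x N weights (assumed layout)
                     const float *bias, // N entries, or nullptr
                     float *y,          // 1 x N output
                     std::size_t K, std::size_t N,
                     bool clamp, float minval, float maxval)
{
    for (std::size_t j = 0; j < N; j++) {
        float acc = bias ? bias[j] : 0.0f; // "no bias" -> zeroed ZA
        for (std::size_t k = 0; k < K; k++)
            acc += A[k] * B[k * N + j];    // fmla accumulation
        if (clamp) {                       // fclamp with minval/maxval
            acc = acc < minval ? minval : acc;
            acc = acc > maxval ? maxval : acc;
        }
        y[j] = acc;
    }
}

The fp32bf16fp32_dot variant that follows has the same structure but first converts the input row to bfloat16 (the bfcvt/uzp1/trn1 sequence) and accumulates with bfdot instead of fmla.
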
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
index f52fbcd57f..76c2bdd71e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "../std_transforms_sme.hpp"
#include "../bfloat.hpp"
@@ -84,4 +83,4 @@ public:
#undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
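
The guard rewrites in these files all follow the same shape: nested __ARM_FEATURE_SVE / ARM_COMPUTE_ENABLE_SME2 (or __aarch64__) checks collapse into a single test whose closing #endif names the condition. Schematically (the declaration is a placeholder, not a symbol from this patch):

// Before: nested guards, trailing #endif left unannotated
#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
void some_sme2_kernel(); // placeholder
#endif // ARM_COMPUTE_ENABLE_SME2
#endif

// After: one guard, annotated to match its #endif
#if defined(ARM_COMPUTE_ENABLE_SME2)
void some_sme2_kernel(); // placeholder
#endif // defined(ARM_COMPUTE_ENABLE_SME2)
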
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
index 0a394b6413..c6fa11016f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -62,7 +62,7 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
break;
}
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
"cntw x10, ALL, MUL #4\n"
"add x28, %x[N], x10\n"
@@ -103,494 +103,494 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"bgt 20f\n"
"beq 12f\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"mov x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 5f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa040c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"6:" // Width 1: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 8f\n"
"7:" // Width 1: Multiply loop: Main loop head
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "sub x21, x21, #0x8\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "addvl x26, x26, #16\n"
- "cmp x21, #0x8\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z10.h, z10.h, z10.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z10.d, z10.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15ab198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[0]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xc15ab598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
"addvl x26, x26, #16\n"
"add x23, x23, #0x20\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xc15ab818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[2]\n"
".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15abf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z10.h[3]\n"
"bgt 7b\n"
"8:" // Width 1: Multiply loop: Single iteration only
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "subs x21, x21, #0x2\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z17.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z17.h, z17.h, z17.h\n"
+ "trn1 z15.d, z15.d, z17.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
"ble 9f\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
"9:" // Width 1: Multiply loop: multiply skip
"tbz %x[flags], #1, 10f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z29.s }, p2/Z, [x21]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
- ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z8.s }, p2/Z, [x21]\n"
+ "ld1rw { z26.s }, p2/Z, [x20]\n"
+ ".inst 0xc1bac900 // fclamp { z0.s-z3.s }, z8.s, z26.s\n"
+ ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"b 11f\n"
"10:" // Width 1: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c324 // st1w { z4.s-z7.s }, p8, [x25]\n"
"addvl x25, x25, #4\n"
"11:" // Width 1: Output done
"b 36f\n"
"12:" // Width 2
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"sub x20, %x[N], x10\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 13f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
"b 14f\n"
"13:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"14:" // Width 2: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 16f\n"
"15:" // Width 2: Multiply loop: Main loop head
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "sub x21, x21, #0x8\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "cmp x21, #0x8\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z13.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aab7b // bfcvt z27.h, p2/M, z27.s\n"
+ "uzp1 z13.h, z13.h, z13.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "trn1 z13.d, z13.d, z27.d\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[0]\n"
"addvl x26, x26, #16\n"
"add x23, x23, #0x20\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15db019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z13.h[0]\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15db719 // bfdot za.s[x9, 1], { z24.h-z27.h }, z13.h[1]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z13.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15dba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z13.h[2]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15dbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z13.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xc15dbc99 // bfdot za.s[x9, 1], { z4.h-z7.h }, z13.h[3]\n"
"bgt 15b\n"
"16:" // Width 2: Multiply loop: Single iteration only
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "subs x21, x21, #0x2\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "trn1 z15.d, z15.d, z5.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xc15fb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z15.h[0]\n"
"ble 17f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb798 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[1]\n"
".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xc15fb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
"ble 17f\n"
".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
"17:" // Width 2: Multiply loop: multiply skip
"tbz %x[flags], #1, 18f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z29.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ "ld1rw { z11.s }, p2/Z, [x21]\n"
".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ ".inst 0xc1bcc974 // fclamp { z20.s-z23.s }, z11.s, z28.s\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc1bcc96c // fclamp { z12.s-z15.s }, z11.s, z28.s\n"
".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"b 19f\n"
"18:" // Width 2: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c320 // st1w { z0.s-z3.s }, p8, [x25, #0x4, MUL VL]\n"
"addvl x25, x25, #8\n"
"19:" // Width 2: Output done
"b 36f\n"
"20:" // Width 3
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"msub x20, x10, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 21f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
- ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
"b 22f\n"
"21:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"22:" // Width 3: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 24f\n"
"23:" // Width 3: Multiply loop: Main loop head
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "sub x21, x21, #0x8\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z14.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ce // bfcvt z14.h, p2/M, z14.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z14.h, z14.h, z14.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z14.d, z14.d, z16.d\n"
".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "cmp x21, #0x8\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z14.h[0]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z14.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15eb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xc15eb518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z14.h[1]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15eb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z14.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ebb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb81a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ebf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z14.h[3]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xc15ebf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[3]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ ".inst 0xc15ebe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[3]\n"
"bgt 23b\n"
"24:" // Width 3: Multiply loop: Single iteration only
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "subs x21, x21, #0x2\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "add x23, x23, #0x20\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
- "ble 25f\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z31.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z31.h, z31.h, z31.h\n"
+ "trn1 z15.d, z15.d, z31.d\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
- ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
+ ".inst 0xc15fb09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[0]\n"
"ble 25f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xc15fb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[2]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbb1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
"ble 25f\n"
".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
"25:" // Width 3: Multiply loop: multiply skip
"tbz %x[flags], #1, 26f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z29.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
- ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
- ".inst 0xa062c324 // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c73c // st1w { z28.s-z31.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xa061c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"b 27f\n"
"26:" // Width 3: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
- ".inst 0xa062c324 // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
"addvl x25, x25, #12\n"
"27:" // Width 3: Output done
"b 36f\n"
"28:" // Width 4
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
"msub x20, x10, x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
"cbz x24, 29f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
- ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
- ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
- ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
- ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ ".inst 0xa040c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ ".inst 0xa043c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n"
"addvl x24, x24, #16\n"
"b 30f\n"
"29:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"30:" // Width 4: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
"ble 32f\n"
"31:" // Width 4: Multiply loop: Main loop head
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "sub x21, x21, #0x8\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "cmp x21, #0x8\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z6.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa8c6 // bfcvt z6.h, p2/M, z6.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z6.d, z6.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "cmp x22, #0x8\n"
".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xc156b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z6.h[0]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
- ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ ".inst 0xc156b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z6.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b19a // bfdot za.s[x9, 2], { z12.h-z15.h }, z6.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156b21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[1]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b599 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b41a // bfdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[1]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[2]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b999 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b91a // bfdot za.s[x9, 2], { z8.h-z11.h }, z6.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156ba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[2]\n"
".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z6.h[3]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156bd99 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156bf1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z6.h[3]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156be1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[3]\n"
"bgt 31b\n"
"32:" // Width 4: Multiply loop: Single iteration only
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "subs x21, x21, #0x2\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z15.d, z15.d, z16.d\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
"add x23, x23, #0x20\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
- ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb318 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
+ ".inst 0xc15fb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[0]\n"
"ble 33f\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
- ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[1]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[1]\n"
+ ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb69a // bfdot za.s[x9, 2], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb41b // bfdot za.s[x9, 3], { z0.h-z3.h }, z15.h[1]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "subs x21, x21, #0x2\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
- ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
- ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z15.h[2]\n"
"addvl x26, x26, #16\n"
"ble 33f\n"
- ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
- ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
- ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15fbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[3]\n"
"addvl x26, x26, #16\n"
"33:" // Width 4: Multiply loop: multiply skip
"tbz %x[flags], #1, 34f\n"
"add x21, %x[args_ptr], %[offset_min]\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- "ld1rw { z29.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- "ld1rw { z18.s }, p2/Z, [x20]\n"
- ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
- ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
- ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
- ".inst 0xa062c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc1b2cba0 // fclamp { z0.s-z3.s }, z29.s, z18.s\n"
- ".inst 0xa063c320 // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"b 35f\n"
"34:" // Width 4: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
- ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
- ".inst 0xa062c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
- ".inst 0xa063c320 // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
"addvl x25, x25, #16\n"
"35:" // Width 4: Output done
"subs x28, x28, #0x4\n"
@@ -598,7 +598,7 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
"bgt 4b\n"
"36:" // Exit
".inst 0xd503467f // SMSTOP\n"
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
: [N] "+&r" (N)
: [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -607,5 +607,4 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
index 4c9f9cff9a..65e4667f88 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "../std_transforms_sme.hpp"
#define ARGLIST \
@@ -83,4 +82,4 @@ public:
#undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
index 26dc0b9dd2..86bd8aeb04 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -35,11 +35,9 @@ namespace arm_gemm {
void sme2_gemv_s8qa_dot_16VL (
const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr,
size_t N, size_t K,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
)
{
- ARM_COMPUTE_UNUSED(col_base);
-
struct KernelArgs {
const int8_t *B_ptr = {};
size_t output_offset = {};
@@ -52,7 +50,7 @@ void sme2_gemv_s8qa_dot_16VL (
flags |= 0x20;
}
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
"cntw x28, ALL, MUL #4\n"
"add x27, %x[N], x28\n"
@@ -84,8 +82,8 @@ void sme2_gemv_s8qa_dot_16VL (
".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[col_bias]\n"
- "mov z26.s, #0x0\n"
- "mov z24.b, #0x1\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"4:" // Column loop
"cmp x27, #0x4\n"
@@ -94,404 +92,404 @@ void sme2_gemv_s8qa_dot_16VL (
"bgt 24f\n"
"beq 14f\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "mov x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 5f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"6:" // Width 1: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 9f\n"
"7:" // Width 1: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- "addvl x26, x26, #16\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 8f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"8:" // Width 1: Multiply loop: unique 1: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 7b\n"
"9:" // Width 1: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd20 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"10:" // Width 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"11:" // Width 1: Multiply loop: unique 2: skip row sum
"tbnz %x[flags], #31, 12f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
"whilelt p0.s, XZR, x20\n"
- "saddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"12:" // Width 1: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x25]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z19.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z19.b\n"
+ "st1b { z12.b }, p1, [x25]\n"
"addvl x25, x25, #1\n"
"13:" // Width 1: Output done
"b 44f\n"
"14:" // Width 2
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "sub x20, %x[N], x28\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 15f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
"b 16f\n"
"15:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"16:" // Width 2: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 19f\n"
"17:" // Width 2: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bca0 // sdot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 18f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"18:" // Width 2: Multiply loop: unique 3: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 17b\n"
"19:" // Width 2: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b320 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"20:" // Width 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 21f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"21:" // Width 2: Multiply loop: unique 4: skip row sum
"tbnz %x[flags], #31, 22f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "saddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"22:" // Width 2: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
"23:" // Width 2: Output done
"b 44f\n"
"24:" // Width 3
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "msub x20, x28, x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 25f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
- ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
"b 26f\n"
"25:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"26:" // Width 3: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 29f\n"
"27:" // Width 3: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6a2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xc151b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bca1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 28f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"28:" // Width 3: Multiply loop: unique 5: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 27b\n"
"29:" // Width 3: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b222 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151baa2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bda2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"30:" // Width 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 31f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"31:" // Width 3: Multiply loop: unique 6: skip row sum
"tbnz %x[flags], #31, 32f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "saddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"32:" // Width 3: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
- ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z1.h, z2.h, z3.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "uzp1 z0.b, z0.b, z1.b\n"
- "st1b { z0.b }, p1, [x25, #2, MUL VL]\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
"addvl x25, x25, #3\n"
"33:" // Width 3: Output done
"b 44f\n"
"34:" // Width 4
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "msub x20, x28, x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 35f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
- ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
"addvl x24, x24, #16\n"
@@ -499,165 +497,165 @@ void sme2_gemv_s8qa_dot_16VL (
"35:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"36:" // Width 4: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 39f\n"
"37:" // Width 4: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
- ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b623 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bda1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf22 // sdot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bca3 // sdot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 38f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"38:" // Width 4: Multiply loop: unique 7: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 37b\n"
"39:" // Width 4: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b122 // sdot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b223 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b621 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6a3 // sdot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
- ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"40:" // Width 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 41f\n"
- "sdot z26.s, z3.b, z24.b\n"
+ "sdot z28.s, z1.b, z29.b\n"
"41:" // Width 4: Multiply loop: unique 8: skip row sum
"tbnz %x[flags], #31, 42f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "saddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"42:" // Width 4: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
"ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
- ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
- ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
- ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
- ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z1.h, z2.h, z3.h\n"
- "uzp1 z8.h, z8.h, z9.h\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "uzp1 z0.b, z0.b, z1.b\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z0.b }, p2, [x25, #2, MUL VL]\n"
- "st1b { z8.b }, p1, [x25, #3, MUL VL]\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
"43:" // Width 4: Output done
"subs x27, x27, #0x4\n"
@@ -665,7 +663,7 @@ void sme2_gemv_s8qa_dot_16VL (
"bgt 4b\n"
"44:" // Exit
".inst 0xd503467f // SMSTOP\n"
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
: [N] "+&r" (N), [flags] "+&r" (flags)
: [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -674,5 +672,4 @@ void sme2_gemv_s8qa_dot_16VL (
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
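
For readers tracing the renumbered requantization tail above (sqdmulh by per_layer_mul, srshl by per_layer_right_shift, add of c_offset, sclamp against minval/maxval, plus the addha of the row sum scaled by -b_offset), the following scalar C++ sketch models the equivalent per-lane arithmetic. It is illustrative only: the field names come from the asm operand list, while sqdmulh_s32, srshl_s32, and requantize_lane are hypothetical helpers modelling the instruction semantics, not library API.

#include <algorithm>
#include <cstdint>

// Doubling multiply returning the high half, with saturation: models SQDMULH
// (no rounding; SQRDMULH would round).
static int32_t sqdmulh_s32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;   // the only saturating case
    return (int32_t)(((int64_t)a * b * 2) >> 32);
}

// Signed rounding shift left: models SRSHL, where a negative shift amount
// performs a rounding shift right.
static int32_t srshl_s32(int32_t v, int32_t shift)
{
    if (shift >= 0) return (int32_t)((int64_t)v << shift);
    const int s = -shift;
    return (int32_t)(((int64_t)v + ((int64_t)1 << (s - 1))) >> s);
}

// One output lane of the quantized GEMV epilogue; the kernel applies the same
// steps to four z-register groups at a time before uzp1 narrows 32->16->8 bits.
static uint8_t requantize_lane(int32_t acc, int32_t row_sum,
                               int32_t b_offset, int32_t c_offset,
                               int32_t per_layer_mul, int32_t per_layer_right_shift,
                               int32_t minval, int32_t maxval)
{
    acc += row_sum * -b_offset;                    // addha of (saddv/uaddv row sum * -b_offset)
    acc  = sqdmulh_s32(acc, per_layer_mul);        // sqdmulh { ... }, per_layer_mul
    acc  = srshl_s32(acc, per_layer_right_shift);  // srshl { ... }, per_layer_right_shift
    acc += c_offset;                               // add { ... }, c_offset
    acc  = std::clamp(acc, minval, maxval);        // sclamp { ... }, minval, maxval
    return (uint8_t)acc;                           // uzp1/uzp1 + st1b narrowing store
}
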
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
index e15b95445e..46d8c4439b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "../std_transforms_sme.hpp"
#define ARGLIST \
@@ -83,4 +82,4 @@ public:
#undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
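
The guard rewrites in this and the neighbouring files replace the nested __ARM_FEATURE_SVE / ARM_COMPUTE_ENABLE_SME2 (or bare __aarch64__) conditionals with a single ARM_COMPUTE_ENABLE_SME2 check, presumably because enabling SME2 in the build already implies the required SVE and AArch64 support. A minimal sketch of the resulting pattern follows; example_kernel.hpp and example_sme2_kernel are hypothetical stand-ins, not part of the patch.

// example_kernel.hpp -- hypothetical file showing the consolidated guard style
#pragma once
#if defined(ARM_COMPUTE_ENABLE_SME2)

#include <cstdint>

namespace arm_gemm {

// Declarations are visible only when the build enables SME2; no separate
// __ARM_FEATURE_SVE or __aarch64__ check is kept, on the assumption that
// the SME2 build flag subsumes them.
void example_sme2_kernel(const uint8_t *A, const uint8_t *B, uint8_t *out);

} // namespace arm_gemm

#endif // defined(ARM_COMPUTE_ENABLE_SME2)
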
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
index dfdc4ea289..093feee6ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -35,11 +35,9 @@ namespace arm_gemm {
void sme2_gemv_u8qa_dot_16VL (
const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr,
size_t N, size_t K,
- const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
)
{
- ARM_COMPUTE_UNUSED(col_base);
-
struct KernelArgs {
const uint8_t *B_ptr = {};
size_t output_offset = {};
@@ -52,7 +50,7 @@ void sme2_gemv_u8qa_dot_16VL (
flags |= 0x20;
}
__asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
".inst 0xd503477f // SMSTART ZA\n"
"cntw x28, ALL, MUL #4\n"
"add x27, %x[N], x28\n"
@@ -84,8 +82,8 @@ void sme2_gemv_u8qa_dot_16VL (
".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
"3:" // RHS prefetch exit
"mov x24, %x[col_bias]\n"
- "mov z26.s, #0x0\n"
- "mov z24.b, #0x1\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"4:" // Column loop
"cmp x27, #0x4\n"
@@ -94,404 +92,404 @@ void sme2_gemv_u8qa_dot_16VL (
"bgt 24f\n"
"beq 14f\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "mov x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 5f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
"b 6f\n"
"5:" // Width 1: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"6:" // Width 1: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 9f\n"
"7:" // Width 1: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- "addvl x26, x26, #16\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 8f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"8:" // Width 1: Multiply loop: unique 1: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 7b\n"
"9:" // Width 1: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 10f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd30 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"10:" // Width 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"11:" // Width 1: Multiply loop: unique 2: skip row sum
"tbnz %x[flags], #31, 12f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
"whilelt p0.s, XZR, x20\n"
- "uaddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"12:" // Width 1: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x25]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z19.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z19.b\n"
+ "st1b { z12.b }, p1, [x25]\n"
"addvl x25, x25, #1\n"
"13:" // Width 1: Output done
"b 44f\n"
"14:" // Width 2
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "sub x20, %x[N], x28\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 15f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
"b 16f\n"
"15:" // Width 2: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"16:" // Width 2: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 19f\n"
"17:" // Width 2: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6b1 // udot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bcb0 // udot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 18f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"18:" // Width 2: Multiply loop: unique 3: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 17b\n"
"19:" // Width 2: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b330 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 20f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"20:" // Width 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 21f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"21:" // Width 2: Multiply loop: unique 4: skip row sum
"tbnz %x[flags], #31, 22f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "uaddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"22:" // Width 2: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
"addvl x25, x25, #2\n"
"23:" // Width 2: Output done
"b 44f\n"
"24:" // Width 3
"mov x20, #0x2\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "msub x20, x28, x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 25f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
- ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
"b 26f\n"
"25:" // Width 3: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"26:" // Width 3: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 29f\n"
"27:" // Width 3: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6b2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xc151b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bcb1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 28f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"28:" // Width 3: Multiply loop: unique 5: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 27b\n"
"29:" // Width 3: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b232 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b730 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bab2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 30f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bdb2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"30:" // Width 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 31f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"31:" // Width 3: Multiply loop: unique 6: skip row sum
"tbnz %x[flags], #31, 32f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "uaddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"32:" // Width 3: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
- ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z1.h, z2.h, z3.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "uzp1 z0.b, z0.b, z1.b\n"
- "st1b { z0.b }, p1, [x25, #2, MUL VL]\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
"addvl x25, x25, #3\n"
"33:" // Width 3: Output done
"b 44f\n"
"34:" // Width 4
"mov x20, #0x3\n"
"mov x23, %x[A_ptr]\n"
- "mov x22, %x[K]\n"
- "msub x20, x28, x20, %x[N]\n"
"mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
"whilelt p1.b, XZR, x20\n"
"cbz x24, 35f\n"
- ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
- ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
- ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
- ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
- ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
"addvl x24, x24, #16\n"
@@ -499,165 +497,165 @@ void sme2_gemv_u8qa_dot_16VL (
"35:" // Width 4: no bias
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"36:" // Width 4: setup done
- "cmp x21, #0x10\n"
+ "cmp x22, #0x10\n"
"ble 39f\n"
"37:" // Width 4: Multiply loop: Main loop head
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x23, x23, #0x10\n"
".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
- ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b633 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bdb1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf32 // udot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bcb3 // udot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"tbnz %x[flags], #31, 38f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"38:" // Width 4: Multiply loop: unique 7: skip row sum
- "sub x21, x21, #0x10\n"
- "cmp x21, #0x10\n"
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
"bgt 37b\n"
"39:" // Width 4: Multiply loop: Single iteration only
- "whilelt p0.b, XZR, x21\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
"add x23, x23, #0x10\n"
- ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b132 // udot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b233 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
- ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
- ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
- ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b631 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6b3 // udot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- "subs x21, x21, #0x4\n"
- ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
- ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
- ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
- ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
"addvl x26, x26, #16\n"
"ble 40f\n"
- ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
- ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
- ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
- ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
- ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
- ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xc151be33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
"addvl x26, x26, #16\n"
"40:" // Width 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 41f\n"
- "udot z26.s, z3.b, z24.b\n"
+ "udot z28.s, z1.b, z29.b\n"
"41:" // Width 4: Multiply loop: unique 8: skip row sum
"tbnz %x[flags], #31, 42f\n"
"add x21, %x[qp], %[b_offset]\n"
"mov x20, #0x4\n"
- "ld1rw { z10.s }, p2/Z, [x21]\n"
- "neg z10.s, p2/M, z10.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
"whilelt p0.s, XZR, x20\n"
- "uaddv d26, p0, z26.s\n"
- "mov z26.s, z26.s[0]\n"
- "mul z26.s, p2/M, z26.s, z10.s\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"42:" // Width 4: skip row sum fixup
- ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[c_offset]\n"
"add x21, %x[qp], %[minval]\n"
- ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
"ld1rw { z6.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[maxval]\n"
- ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
- "ld1rw { z21.s }, p2/Z, [x21]\n"
- ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
- ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
- ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
- ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
- ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
- ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
- ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
- ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1rw { z16.s }, p2/Z, [x20]\n"
- ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
- ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
- ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
- ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
- ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
- ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z0.h, z0.h, z1.h\n"
- "uzp1 z1.h, z2.h, z3.h\n"
- "uzp1 z8.h, z8.h, z9.h\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p2, [x25]\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
- "uzp1 z0.b, z0.b, z1.b\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z0.b }, p2, [x25, #2, MUL VL]\n"
- "st1b { z8.b }, p1, [x25, #3, MUL VL]\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
"43:" // Width 4: Output done
"subs x27, x27, #0x4\n"
@@ -665,7 +663,7 @@ void sme2_gemv_u8qa_dot_16VL (
"bgt 4b\n"
"44:" // Exit
".inst 0xd503467f // SMSTOP\n"
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
: [N] "+&r" (N), [flags] "+&r" (flags)
: [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -674,5 +672,4 @@ void sme2_gemv_u8qa_dot_16VL (
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
index 37eb63d898..edfb362aab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "../bfloat.hpp"
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
- cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
index c6eb858ade..8105300cb7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,12 +112,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z21.s, #1.0\n"
+ "fmov z6.s, #1.0\n"
".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
- ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
- ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
- ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x9\n"
"mov x21, x10\n"
@@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"madd x23, x9, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- "ld1h { z0.h }, p0/Z, [x26]\n"
- ".inst 0xa140a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
- "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa141a6ea // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142a6eb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa143a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
"subs x22, x22, #0x1\n"
- ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
- ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
- ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
- "ld1h { z0.h }, p0/Z, [x26]\n"
- ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
- ".inst 0xa140a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
- ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
- ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
- ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
- "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
- ".inst 0xa141a6ea // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
- ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
- ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
- "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142a6eb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
- ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
- ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
- ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
- "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+ ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+ ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+ ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa143a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
- ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
- ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
- ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
- ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
- ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
- ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
- ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
- ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
- ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
- ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
- ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
- ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
- ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
- ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
- ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+ ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+ ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+ ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+ ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1h { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z8.h }, p0/Z, [x26]\n"
"subs x21, x21, #0x1\n"
"addvl x26, x26, #1\n"
- ".inst 0xa140a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa140a6e3 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
- ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
- ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
- ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ ".inst 0x81830100 // bfmopa za0.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81870101 // bfmopa za1.s, p0/M, p0/M, z8.h, z7.h\n"
+ ".inst 0x818b0102 // bfmopa za2.s, p0/M, p0/M, z8.h, z11.h\n"
+ ".inst 0x818f0103 // bfmopa za3.s, p0/M, p0/M, z8.h, z15.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x15, #1, 14f\n"
@@ -242,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
"addvl x14, x14, #16\n"
- ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -268,15 +267,15 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
- ".inst 0xa061c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 13b\n"
@@ -314,18 +313,18 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x24, x24, x22\n"
@@ -334,66 +333,66 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
"18:" // Store to output array: Skip activation: End
"cntw x20\n"
"cmp x24, x20\n"
- "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x20, x24, x20, LT\n"
"lsr x21, x20, #0x2\n"
- "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
- ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
- ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"add x12, x12, #0x4\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+ ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 21f\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
- ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
- ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"21:" // Store to output array: Accumulator row 0 oddments: End
"22:" // Store to output array: End
"tbz x15, #0, 24f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x14, x14, #16\n"
@@ -417,4 +416,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
index 89c79cfb0a..ca7b0573fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "../bfloat.hpp"
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
- cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
index b63f2110ff..20c1de9418 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z21.s, #1.0\n"
- ".inst 0xa00a428f // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
- ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
- ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
- ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0xa14026ff // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
- ".inst 0xa0412768 // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04126e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1422772 // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04226f1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1432776 // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14326ec // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
"subs x22, x22, #0x1\n"
- ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
- ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
- ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
- ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
- ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
- ".inst 0xa14026ff // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
- ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
- ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
- ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
- ".inst 0xa0412768 // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
- ".inst 0xa04126e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
- ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
- ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
- ".inst 0xa1422772 // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04226f1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
- ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
- ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
- ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
- ".inst 0xa1432776 // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14326ec // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
- ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
- ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
- ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
- ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
- ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
- ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
- ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
- ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
- ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
- ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
- ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
- ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
- ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
- ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
- ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040277e // ld1h { z30.h-z31.h }, pn9.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
- ".inst 0xa14026f7 // ld1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa14026e5 // ld1h { z5.h, z13.h }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
- ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
- ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
- ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ ".inst 0x818503c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818d03c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z13.h\n"
+ ".inst 0x818503e2 // bfmopa za2.s, p0/M, p0/M, z31.h, z5.h\n"
+ ".inst 0x818d03e3 // bfmopa za3.s, p0/M, p0/M, z31.h, z13.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -243,24 +242,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -268,16 +267,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
@@ -312,16 +311,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -349,16 +348,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -367,44 +366,44 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"21:" // Store to output array: Skip activation: End
"cntw x23\n"
"cmp x25, x23\n"
- "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
- "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 24f\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"24:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -418,8 +417,8 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"25:" // Store to output array: Accumulator row 1 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
@@ -435,8 +434,8 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"cbz x20, 27f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
@@ -452,14 +451,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -483,4 +482,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
index 0d407e0cba..7b31d6d2db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "../bfloat.hpp"
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
- cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
index a51b3db4b0..70c94d32a3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z8.s, #1.0\n"
- "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
- "ldnt1h { z29.h }, p1/Z, [x23]\n"
- ".inst 0xa041a36c // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa042a360 // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa143a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
"subs x22, x22, #0x1\n"
- ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
- ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
- ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
- ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
- ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
- "ldnt1h { z29.h }, p1/Z, [x23]\n"
- ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
- ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
- ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
- ".inst 0xa041a36c // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
- "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
- ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
- ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
- ".inst 0xa042a360 // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
- ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
- ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
- ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
- ".inst 0xa143a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
- ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
- ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
- ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
- ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
- ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
- ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
- ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
- ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
- ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
- ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
- ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
- ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
- ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
- ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
- ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
+ ".inst 0xa140a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1h { z29.h }, p1/Z, [x23]\n"
+ "ld1h { z11.h }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
- ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
- ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
- ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ ".inst 0x818b2660 // bfmopa za0.s, p1/M, p1/M, z19.h, z11.h\n"
+ ".inst 0x818b26e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z11.h\n"
+ ".inst 0x818b2762 // bfmopa za2.s, p1/M, p1/M, z27.h, z11.h\n"
+ ".inst 0x818b27e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z11.h\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -242,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa042c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
"addvl x15, x15, #16\n"
".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 42f\n"
@@ -269,15 +268,15 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa061c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 42f\n"
@@ -296,16 +295,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
@@ -331,30 +330,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- "st1w { z4.s }, p0, [x26]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z5.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- "st1w { z6.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -366,30 +365,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Skip activation: Accumulator row 2 loop
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 21b\n"
"22:" // Store to output array: Skip activation: Accumulator row 2 oddments
"cbz x20, 23f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
@@ -401,30 +400,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x22, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Skip activation: Accumulator row 3 loop
- ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
- "st1w { z4.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z5.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z6.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z7.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 24b\n"
"25:" // Store to output array: Skip activation: Accumulator row 3 oddments
"cbz x20, 26f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 26f\n"
"subs x20, x20, #0x1\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 26f\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
"subs x25, x25, x22\n"
@@ -433,40 +432,40 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"27:" // Store to output array: Skip activation: End
"cntw x23\n"
"cmp x25, x23\n"
- "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
- "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 29f\n"
"28:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 28b\n"
"29:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 30f\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
"subs x20, x20, #0x1\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"30:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -478,24 +477,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x22, #0x3\n"
"cbz x21, 32f\n"
"31:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 31b\n"
"32:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 33f\n"
".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 33f\n"
@@ -516,7 +515,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"cbz x21, 35f\n"
"34:" // Store to output array: Accumulator row 2 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
@@ -532,7 +531,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"cbz x20, 36f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
@@ -552,24 +551,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
"and x20, x20, #0x3\n"
"cbz x21, 38f\n"
"37:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 37b\n"
"38:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 39f\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
@@ -588,10 +587,10 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -615,4 +614,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
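
Note on the guard change above: the trailing `#endif // __ARM_FEATURE_SVE` goes away because the nested compiler-feature guard is dropped throughout, leaving the build-system macro `ARM_COMPUTE_ENABLE_SME2` as the sole guard. Presumably this is so the generated kernels compile regardless of whether the host compiler advertises SVE. A minimal sketch of the resulting pattern (illustrative names only, not part of the patch):

    #ifdef ARM_COMPUTE_ENABLE_SME2   // defined by the build system when SME2 kernels are wanted
    namespace arm_gemm {
    void sme2_kernel_stub()          // hypothetical stand-in for a generated kernel
    {
        // The kernel bodies emit raw ".inst" words through inline asm, so they do
        // not rely on the compiler's own __ARM_FEATURE_* macros being defined.
    }
    } // namespace arm_gemm
    #endif // ARM_COMPUTE_ENABLE_SME2
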
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
index 7777349b42..bf3de2118e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
- cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
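
The header hunk above also replaces the named constructor argument plus `ARM_COMPUTE_UNUSED(ci)` with an unnamed parameter, the standard C++ idiom for an intentionally unused argument. A before/after sketch under that reading (`CPUInfo` reduced to a stub so the fragment is self-contained):

    struct CPUInfo;                                    // stub for the sketch

    struct cls_before {
        cls_before(const CPUInfo *ci) { (void)ci; }    // what ARM_COMPUTE_UNUSED(ci) boils down to
    };

    struct cls_after {
        cls_after(const CPUInfo *) {}                  // unnamed parameter: no warning, no macro
    };
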
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
index dd99387c5e..97be758bd6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,12 +112,12 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z21.s, #1.0\n"
+ "fmov z6.s, #1.0\n"
".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
- ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
- ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
- ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
- ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x9\n"
"mov x21, x10\n"
@@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"madd x21, x9, x20, x21\n" // bptr = B + n * kstride_bytes
"cbz x23, 8f\n"
"subs x23, x23, #0x1\n"
- "ld1w { z0.s }, p0/Z, [x26]\n"
- ".inst 0xa140c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
- "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0xa141c6aa // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
- "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142c6ab // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
- "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa143c6b8 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
"addvl x21, x21, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
"subs x23, x23, #0x1\n"
- ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
- ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
- ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
- "ld1w { z0.s }, p0/Z, [x26]\n"
- ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
- ".inst 0xa140c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
- ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
- ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
- ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
- "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
- ".inst 0xa141c6aa // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
- ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
- ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
- "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0xa142c6ab // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
- ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
- ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
- ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
- ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
- "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+ ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+ ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+ ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+ ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
"addvl x26, x26, #4\n"
- ".inst 0xa143c6b8 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
"addvl x21, x21, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
- ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
- ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
- ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
- ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
- ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
- ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
- ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
- ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
- ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
- ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
- ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
- ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
- ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
- ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
- ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+ ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+ ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+ ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+ ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
"8:" // K oddments
"cbz x22, 10f\n"
"9:" // K oddments: Loop
- "ld1w { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z8.s }, p0/Z, [x26]\n"
"subs x22, x22, #0x1\n"
"addvl x26, x26, #1\n"
- ".inst 0xa140c6b3 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
+ ".inst 0xa140c6a3 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21]\n"
"addvl x21, x21, #4\n"
- ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
- ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
- ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
- ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ ".inst 0x80830100 // fmopa za0.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80870101 // fmopa za1.s, p0/M, p0/M, z8.s, z7.s\n"
+ ".inst 0x808b0102 // fmopa za2.s, p0/M, p0/M, z8.s, z11.s\n"
+ ".inst 0x808f0103 // fmopa za3.s, p0/M, p0/M, z8.s, z15.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x15, #1, 14f\n"
@@ -240,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
"addvl x14, x14, #16\n"
- ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
- ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
- ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -266,15 +265,15 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
- ".inst 0xa061c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
"addvl x13, x13, #16\n"
"blt 13b\n"
@@ -312,18 +311,18 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 17f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x24, x24, x22\n"
@@ -332,66 +331,66 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
"18:" // Store to output array: Skip activation: End
"cntw x20\n"
"cmp x24, x20\n"
- "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x20, x24, x20, LT\n"
"lsr x21, x20, #0x2\n"
- "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x20, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
- ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
- ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"add x12, x12, #0x4\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"add x25, x25, x23\n"
- ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+ ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 21f\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
- ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
- ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
"add x25, x25, x23\n"
"beq 21f\n"
- ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
"21:" // Store to output array: Accumulator row 0 oddments: End
"22:" // Store to output array: End
"tbz x15, #0, 24f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"23:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x14, x14, #16\n"
@@ -415,4 +414,3 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
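
In the bias blocks above (after `cbz x20, 5f`), the kernels load a vector of ones with `fmov ... #1.0` and issue one FMOPA per ZA tile: an outer product with an all-ones column adds the bias row into every accumulator row, which is why only the register numbers change across the hunks, not the structure. In scalar terms this computes roughly the following (a sketch, with `rows`/`cols` standing in for the vector-length-dependent tile shape):

    #include <cstddef>

    // za[r][c] += ones[r] * bias[c]: every accumulator row picks up the bias row,
    // because the left-hand operand of the outer product is all ones.
    void bias_via_outer_product(float *za, const float *bias,
                                std::size_t rows, std::size_t cols)
    {
        for (std::size_t r = 0; r < rows; ++r)
            for (std::size_t c = 0; c < cols; ++c)
                za[r * cols + c] += 1.0f * bias[c];
    }
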
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
index 51e8c43335..9bc1f83100 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {};
- cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
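
A last recurring pattern in these hunks: in the store-to-output paths, the renumbered `fclamp` instructions bound each result against the activation limits loaded from `KernelArgs_min`/`KernelArgs_max` before the `st1w` stores. Per lane this amounts to the following (a scalar sketch; FCLAMP's exact NaN behaviour is elided):

    #include <algorithm>

    // Rough per-lane equivalent of "fclamp zd, zmin, zmax": bound v to [lo, hi].
    inline float clamp_activation(float v, float lo, float hi)
    {
        return std::min(std::max(v, lo), hi);
    }
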
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
index 87d7827c5b..3c475044e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z21.s, #1.0\n"
- ".inst 0xa00a428f // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
- ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
- ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
- ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
"cbz x23, 8f\n"
"subs x23, x23, #0x1\n"
- ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
- ".inst 0xa14046bf // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
- ".inst 0xa0414768 // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04146a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa1424772 // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04246b1 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xa1434776 // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14346ac // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
"addvl x21, x21, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
"subs x23, x23, #0x1\n"
- ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
- ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
- ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
- ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
- ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
- ".inst 0xa14046bf // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
- ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
- ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
- ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
- ".inst 0xa0414768 // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
- ".inst 0xa04146a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
- ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
- ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
- ".inst 0xa1424772 // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04246b1 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
- ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
- ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
- ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
- ".inst 0xa1434776 // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14346ac // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
"addvl x21, x21, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
- ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
- ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
- ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
- ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
- ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
- ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
- ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
- ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
- ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
- ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
- ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
- ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
- ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
- ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
- ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
"8:" // K oddments
"cbz x22, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa040477e // ld1w { z30.s-z31.s }, pn9.b/Z, [x27]\n"
"subs x22, x22, #0x1\n"
"addvl x27, x27, #2\n"
- ".inst 0xa14046b7 // ld1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
+ ".inst 0xa14046a5 // ld1w { z5.s, z13.s }, pn9.b/Z, [x21]\n"
"addvl x21, x21, #2\n"
- ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
- ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
- ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
- ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ ".inst 0x808503c0 // fmopa za0.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808d03c1 // fmopa za1.s, p0/M, p0/M, z30.s, z13.s\n"
+ ".inst 0x808503e2 // fmopa za2.s, p0/M, p0/M, z31.s, z5.s\n"
+ ".inst 0x808d03e3 // fmopa za3.s, p0/M, p0/M, z31.s, z13.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -241,24 +240,24 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -266,16 +265,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
@@ -310,16 +309,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 17f\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -347,16 +346,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -365,44 +364,44 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"21:" // Store to output array: Skip activation: End
"cntw x23\n"
"cmp x25, x23\n"
- "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
- "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 24f\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"24:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -416,8 +415,8 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"25:" // Store to output array: Accumulator row 1 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
@@ -433,8 +432,8 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"cbz x20, 27f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
- ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
- ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
"subs x20, x20, #0x1\n"
".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
"add x26, x26, x24\n"
@@ -450,14 +449,14 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -481,4 +480,3 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
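
The activation epilogue in the kernel above clamps every accumulator tile with FCLAMP against the [min, max] bounds broadcast once by LD1RW from the KernelArgs structure. A scalar sketch of the per-lane behaviour follows; the function name is illustrative only and not part of the library:

    // Scalar model of FCLAMP in the activation epilogue: each lane of the
    // accumulator is bounded by the broadcast min/max activation values.
    float fclamp_ref(float acc, float minval, float maxval)
    {
        if (acc < minval) acc = minval;
        if (acc > maxval) acc = maxval;
        return acc;
    }
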
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
index a315ebb323..165e25dd8f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once

-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2

#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {};
- cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
index 291a7ced5a..ae1f812442 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "fmov z8.s, #1.0\n"
- "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
- ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
"cbz x23, 8f\n"
"subs x23, x23, #0x1\n"
- ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
- "ldnt1w { z29.s }, p1/Z, [x21]\n"
- ".inst 0xa041c36c // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n"
- ".inst 0xa042c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n"
- ".inst 0xa143c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
"addvl x21, x21, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
"subs x23, x23, #0x1\n"
- ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
- ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
- ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
- ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
- ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
- "ldnt1w { z29.s }, p1/Z, [x21]\n"
- ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
- ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
- ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
- ".inst 0xa041c36c // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
- "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n"
- ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
- ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
- ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
- ".inst 0xa042c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n"
- ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
- ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
- ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
- ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
- ".inst 0xa143c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
"addvl x21, x21, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
- ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
- ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
- ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
- ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
- ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
- ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
- ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
- ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
- ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
- ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
- ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
- ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
- ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
- ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
- ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
"8:" // K oddments
"cbz x22, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
+ ".inst 0xa140c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27]\n"
"subs x22, x22, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"addvl x21, x21, #1\n"
- ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
- ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
- ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
- ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ ".inst 0x808b2660 // fmopa za0.s, p1/M, p1/M, z19.s, z11.s\n"
+ ".inst 0x808b26e1 // fmopa za1.s, p1/M, p1/M, z23.s, z11.s\n"
+ ".inst 0x808b2762 // fmopa za2.s, p1/M, p1/M, z27.s, z11.s\n"
+ ".inst 0x808b27e3 // fmopa za3.s, p1/M, p1/M, z31.s, z11.s\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -240,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
- ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa042c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
"addvl x15, x15, #16\n"
".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 42f\n"
@@ -267,15 +266,15 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa061c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 42f\n"
@@ -294,16 +293,16 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Skip activation: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 15b\n"
"16:" // Store to output array: Skip activation: Accumulator row 0 oddments
@@ -329,30 +328,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Skip activation: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 18b\n"
"19:" // Store to output array: Skip activation: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- "st1w { z4.s }, p0, [x26]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z5.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 20f\n"
- "st1w { z6.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -364,30 +363,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Skip activation: Accumulator row 2 loop
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 21b\n"
"22:" // Store to output array: Skip activation: Accumulator row 2 oddments
"cbz x20, 23f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 23f\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z14.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
@@ -399,30 +398,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x22, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Skip activation: Accumulator row 3 loop
- ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
- "st1w { z4.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z5.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z6.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z7.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 24b\n"
"25:" // Store to output array: Skip activation: Accumulator row 3 oddments
"cbz x20, 26f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 26f\n"
"subs x20, x20, #0x1\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 26f\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
"subs x25, x25, x22\n"
@@ -431,40 +430,40 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"27:" // Store to output array: Skip activation: End
"cntw x23\n"
"cmp x25, x23\n"
- "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
"csel x22, x25, x23, LT\n"
"lsr x21, x22, #0x2\n"
- "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
"mov x12, #0x0\n"
"and x20, x22, #0x3\n"
"cbz x21, 29f\n"
"28:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 28b\n"
"29:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 30f\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
"subs x20, x20, #0x1\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 30f\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "st1w { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"30:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -476,24 +475,24 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x22, #0x3\n"
"cbz x21, 32f\n"
"31:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 31b\n"
"32:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 33f\n"
".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 33f\n"
@@ -514,7 +513,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"cbz x21, 35f\n"
"34:" // Store to output array: Accumulator row 2 loop
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1w { z17.s }, p0, [x26]\n"
@@ -530,7 +529,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"cbz x20, 36f\n"
".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 36f\n"
@@ -550,24 +549,24 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
"and x20, x20, #0x3\n"
"cbz x21, 38f\n"
"37:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
- "st1w { z20.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1w { z21.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"add x12, x12, #0x4\n"
- "st1w { z22.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z23.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 37b\n"
"38:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 39f\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 39f\n"
@@ -586,10 +585,10 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -613,4 +612,3 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
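
The K loop of this kernel is built from FMOPA outer products: each step loads four A vectors and one broadcast B vector and accumulates all four ZA tiles. A scalar sketch of a single non-widening FP32 FMOPA is given below; the flat tile layout, the vl parameter, and the names are illustrative only:

    // Scalar model of one FP32 FMOPA: za[i][j] += zn[i] * zm[j] for every
    // element pair where both governing predicates are active. vl is the
    // number of 32-bit lanes per vector (hypothetical parameter).
    void fmopa_ref(float *za, const float *zn, const float *zm,
                   const bool *pn, const bool *pm, int vl)
    {
        for (int i = 0; i < vl; i++)      // tile rows, indexed by zn
        {
            for (int j = 0; j < vl; j++)  // tile columns, indexed by zm
            {
                if (pn[i] && pm[j])
                {
                    za[i * vl + j] += zn[i] * zm[j];
                }
            }
        }
    }
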
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
index b8bcd53c21..7b3cc77867 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once

-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2

#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
index 929af04032..aba677b158 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa042c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x13, x13, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa01cc299 // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n"
- ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
- ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
- ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
- ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x28\n"
"mov x21, x9\n"
@@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- "ld1b { z10.b }, p1/Z, [x25]\n"
- ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
- "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
- ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
- ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
- ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
- "ld1b { z10.b }, p1/Z, [x25]\n"
- ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
- ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
- ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
- ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
- "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
- ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
- ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
- ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
- ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
- "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
- ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
- ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
- ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
- ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
- "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
- ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
- ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
- ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
- ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
- ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
- ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
- ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
- ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
- ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
- ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
- ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
- ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
- ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
- ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z10.b }, p1/Z, [x25]\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
"subs x21, x21, #0x1\n"
"addvl x25, x25, #1\n"
- ".inst 0xa04086fc // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
- ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa0842600 // smopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+ ".inst 0xa0852601 // smopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa0862602 // smopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+ ".inst 0xa0872603 // smopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- "ld1w { z14.s }, p1/Z, [x25]\n"
+ "ld1w { z15.s }, p1/Z, [x25]\n"
"addvl x25, x25, #1\n"
- ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x14, #1, 14f\n"
"tbz x14, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5b8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
"addvl x13, x13, #16\n"
- ".inst 0xa061c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n"
- ".inst 0xa062c578 // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n"
- ".inst 0xa063c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
"addvl x11, x11, #16\n"
"blt 11b\n"
"b 21f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa061c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n"
- ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
"addvl x11, x11, #16\n"
"blt 13b\n"
"b 21f\n"
@@ -277,17 +276,17 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"ldr x24, [%x[args], %[offsetof_C]]\n"
"add x24, x24, x28\n" // C += n
"sub x23, x10, x9\n"
- "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x22, [%x[args], %[offsetof_ldcb]]\n"
"madd x24, x9, x22, x24\n" // C += m * ldc
- "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x14, #2, 15f\n"
@@ -295,10 +294,10 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"add x21, x21, x28\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x20\n"
"whilelt p0.b, x28, x27\n"
@@ -311,22 +310,22 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"16:" // Store to output array: Accumulator row 0 loop
".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
- ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
"add x12, x12, #0x2\n"
"cmp x12, x21, LSL #1\n"
- ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
- ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
- ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
- ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
- ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
- ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
- ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
- ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
- ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
"uzp1 z19.b, z26.b, z28.b\n"
@@ -344,29 +343,29 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
- ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
- ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
- ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
- ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
- ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
- ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
- ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
- ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
- ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
- ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
- ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
- "uzp1 z23.b, z2.b, z24.b\n"
- ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
- "uzp1 z16.b, z16.b, z10.b\n"
- "uzp1 z16.b, z23.b, z16.b\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
+ "uzp1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p0, [x24]\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"19:" // Store to output array: End
@@ -374,14 +373,14 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"20:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa041c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x13, x13, #16\n"
@@ -405,4 +404,3 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
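
This quantized kernel accumulates with 32-bit SMOPA, where each tile element gathers a 4-way dot product of signed bytes, and then requantizes with SQDMULH, SRSHL, an offset ADD, SCLAMP, and UZP1 narrowing. Scalar sketches of both steps follow, assuming a non-positive shift as stored for SRSHL in the Requantize32 parameters; predication and the ADDHA/ADDVA bias and row-sum corrections are not modelled, and all names are illustrative:

    #include <cstdint>

    // Scalar model of 32-bit SMOPA: each tile element accumulates the dot
    // product of four signed bytes from each source vector.
    void smopa_ref(int32_t *za, const int8_t *zn, const int8_t *zm, int vl)
    {
        for (int i = 0; i < vl; i++)
        {
            for (int j = 0; j < vl; j++)
            {
                int32_t sum = 0;
                for (int k = 0; k < 4; k++)
                {
                    sum += int32_t(zn[4 * i + k]) * int32_t(zm[4 * j + k]);
                }
                za[i * vl + j] += sum;
            }
        }
    }

    // Scalar model of the requantize epilogue applied to each accumulator.
    int8_t requantize_ref(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQDMULH: high half of the doubled product, with saturation.
        int32_t res = (acc == INT32_MIN && mul == INT32_MIN)
                          ? INT32_MAX
                          : int32_t((int64_t(acc) * int64_t(mul)) >> 31);
        if (shift < 0)  // SRSHL by a negative amount: rounding shift right
        {
            const int32_t s = -shift;
            res = int32_t((int64_t(res) + (int64_t(1) << (s - 1))) >> s);
        }
        res += c_offset;                 // ADD: apply the output zero point
        if (res < minval) res = minval;  // SCLAMP lower bound
        if (res > maxval) res = maxval;  // SCLAMP upper bound
        return int8_t(res);              // UZP1 keeps the narrowed byte
    }
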
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
index 954b0da0e1..79990f72e5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
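
The header-side edits follow one pattern per kernel: the __aarch64__ guard becomes ARM_COMPUTE_ENABLE_SME2, so the class is only declared when the matching generic.cpp is compiled, and the constructor's CPUInfo pointer loses its name instead of being named and immediately marked unused. An unnamed parameter is the idiomatic C++ way to keep a signature while silencing unused-parameter warnings, which is what lets the ARM_COMPUTE_UNUSED(ci) line go. A minimal sketch of the idiom, with illustrative names:

struct CpuInfoStandIn {};  // stand-in for the library's CPUInfo

class kernel_cls
{
public:
    // Signature kept so all kernels construct the same way; the
    // unnamed parameter produces no -Wunused-parameter diagnostic.
    explicit kernel_cls(const CpuInfoStandIn *) {}
};
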
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
index 0b642818e2..7033de5fe3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
- ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -152,75 +151,75 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
- ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
- ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
- ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
- ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
- ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
- ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
- ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
- ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
- ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
- ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
- ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
- ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
- ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
- ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
- ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
- ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
- ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
- ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
- ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
- ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
- ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
- ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa0902660 // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa0912661 // smopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa0902762 // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa0912763 // smopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
"bgt 9b\n"
"10:" // K oddments: End
".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 24f\n"
@@ -277,13 +276,13 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"ldr x26, [%x[args], %[offsetof_C]]\n"
"add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
@@ -291,10 +290,10 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"add x21, x21, x10\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.h, x10, x9\n"
@@ -305,26 +304,26 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"add x12, x12, #0x4\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
- ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
- ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z12.h, z28.h\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z13.h, z29.h\n"
- "uzp1 z17.h, z14.h, z30.h\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z15.h, z31.h\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -332,27 +331,27 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
- ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z28.h, z12.h\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z29.h, z13.h\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "uzp1 z16.h, z30.h, z14.h\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
@@ -367,25 +366,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
- ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
"add x12, x12, #0x4\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z4.h, z16.h\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z5.h, z17.h\n"
- "uzp1 z17.h, z6.h, z18.h\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z7.h, z19.h\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -393,27 +392,27 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
- ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
- ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z20.h, z16.h\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z21.h, z17.h\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
- "uzp1 z16.h, z22.h, z18.h\n"
+ "uzp1 z16.h, z6.h, z18.h\n"
"st1b { z16.h }, p0, [x26]\n"
"21:" // Store to output array: Accumulator row 1 oddments: End
"22:" // Store to output array: End
@@ -452,4 +451,3 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
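
The store-to-output sections in these quantized kernels all run the same five-step pipeline, visible in the instruction comments: sqdmulh by the per-layer or per-channel multiplier, srshl by the right shift, add of the c_offset, sclamp to [minval, maxval], then uzp1 narrowing and an st1b store. Below is a scalar model of one lane of that pipeline; it is a sketch of the arithmetic the vector instructions perform, not the library's code, and the function names are illustrative.

#include <algorithm>
#include <cstdint>

// sqdmulh: saturating doubling multiply returning the high 32 bits.
static int32_t sqdmulh32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the only saturating case
    return (int32_t)(((int64_t)a * (int64_t)b) >> 31);       // == (2*a*b) >> 32
}

// srshl with a negative shift operand: rounding right shift by n.
static int32_t srshr32(int32_t x, int32_t n)
{
    if (n <= 0) return x;
    return (int32_t)(((int64_t)x + (1LL << (n - 1))) >> n);
}

static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqdmulh32(acc, mul);        // sqdmulh
    v = srshr32(v, right_shift);            // srshl
    v += c_offset;                          // add
    v = std::clamp(v, minval, maxval);      // sclamp
    return (int8_t)v;                       // uzp1 + st1b keep the low byte
}
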
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
index 420c219af5..ef39cbbb28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
index 0d0e3da224..4601f05501 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- "ldnt1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
- ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
- "ldnt1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
- ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
- ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
- ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
- "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
- ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
- ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
- ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
- ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
- ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
- ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
- ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
- ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
- ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
- ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
- ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
- ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
- ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
- ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
- ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
- ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
- ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
- ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1b { z0.b }, p1/Z, [x23]\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
- ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa08f2640 // smopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+ ".inst 0xa08f26c1 // smopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+ ".inst 0xa08f2742 // smopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+ ".inst 0xa08f27c3 // smopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- ".inst 0xa040c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n"
+ ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
"addvl x27, x27, #4\n"
- ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
- ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
- ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
- ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x16, #1, 14f\n"
"tbz x16, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
@@ -277,22 +276,22 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"ldr x26, [%x[args], %[offsetof_C]]\n"
"add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
"ldr w21, [%x[args], %[offsetof_n_0]]\n"
"add x21, x21, x10\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- "ld1w { z8.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- "ld1w { z7.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.s, x10, x9\n"
@@ -303,30 +302,30 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
- "st1b { z12.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z13.s }, p0, [x26]\n"
+ "st1b { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z14.s }, p0, [x26]\n"
+ "st1b { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z15.s }, p0, [x26]\n"
+ "st1b { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
- ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
@@ -347,38 +346,38 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"and x20, x22, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
- "st1b { z28.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- "st1b { z29.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
- "st1b { z30.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"21:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -391,30 +390,30 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
- ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
- "st1b { z24.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z25.s }, p0, [x26]\n"
+ "st1b { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z26.s }, p0, [x26]\n"
+ "st1b { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z27.s }, p0, [x26]\n"
+ "st1b { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 24f\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
"st1b { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
@@ -435,52 +434,52 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
- ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
- "st1b { z20.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z21.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z22.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z23.s }, p0, [x26]\n"
+ "st1b { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 27f\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
- "st1b { z0.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
"subs x20, x20, #0x1\n"
- "st1b { z1.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
- "st1b { z2.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"27:" // Store to output array: Accumulator row 3 oddments: End
"28:" // Store to output array: End
"tbz x16, #0, 30f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -504,4 +503,3 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
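
The K loops in these generic.cpp files are built from s8 SMOPA outer products: tiles za0-za3 accumulate while the loads for the next unrolled step are issued in between (software pipelining), with an un-pipelined copy of the smopa group as the loop tail and a one-vector-at-a-time oddments loop for the remaining K bytes. As a reference for what a single smopa contributes, here is a scalar model of one s8 step with all predicate lanes assumed true; vl and the flat layout are illustrative, not the kernel's code.

#include <cstddef>
#include <cstdint>

// One s8 smopa step: every 32-bit ZA cell [r][c] accumulates a 4-way
// dot product of signed bytes from the A and B operand vectors.
// vl = vector length in 32-bit lanes (e.g. 4 on a 128-bit SVE machine).
void smopa_s8_step(int32_t *za, const int8_t *a, const int8_t *b, size_t vl)
{
    for (size_t r = 0; r < vl; ++r)
        for (size_t c = 0; c < vl; ++c)
            for (size_t i = 0; i < 4; ++i)
                za[r * vl + c] += (int32_t)a[4 * r + i] * (int32_t)b[4 * c + i];
}
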
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
index c969c7aaff..b9d8b60c8d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@ namespace arm_gemm
{
// Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL
{
@@ -40,7 +40,7 @@ public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
/* Kernel blocking parameters */
static unsigned int out_height()
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
- cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
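
The same unnamed-parameter treatment is applied to the Activation argument here: the s8s32 kernel produces raw int32 output and never applies an activation (the old body immediately marked act as unused), so both the kern_type alias and the function definition now leave the parameter nameless while keeping the slot, letting every GEMM kernel share one function type. The shape of the change, as a comment sketch:

// before: named, then explicitly silenced
//   void kern(..., const Activation act, ...) { ARM_COMPUTE_UNUSED(act); ... }
// after: unnamed in declaration and definition alike; nothing to mark
//   void kern(..., const Activation, ...) { ... }
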
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
index 12e714a471..d11faa634d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
namespace arm_gemm {
-void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
{
- ARM_COMPUTE_UNUSED(act);
-
struct KernelArgs
{
KernelArgs(
@@ -96,12 +93,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"1:" // Initial accumulator load from buffer: Loop
".inst 0xa040c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11]\n"
".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x11, x11, #16\n"
@@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa11bc28a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, p8/Z, [x20, x27, LSL #2]\n"
- ".inst 0xc0900040 // addha za0.s, p0/M, p0/M, z2.s\n"
- ".inst 0xc09000c1 // addha za1.s, p0/M, p0/M, z6.s\n"
- ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n"
- ".inst 0xc09001c3 // addha za3.s, p0/M, p0/M, z14.s\n"
+ ".inst 0xa11bc29b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, p8/Z, [x20, x27, LSL #2]\n"
+ ".inst 0xc0900260 // addha za0.s, p0/M, p0/M, z19.s\n"
+ ".inst 0xc09002e1 // addha za1.s, p0/M, p0/M, z23.s\n"
+ ".inst 0xc0900362 // addha za2.s, p0/M, p0/M, z27.s\n"
+ ".inst 0xc09003e3 // addha za3.s, p0/M, p0/M, z31.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x27\n"
"mov x21, x28\n"
@@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"madd x23, x27, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- "ld1b { z20.b }, p0/Z, [x24]\n"
- ".inst 0xa14086e9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
- "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0xa14186fa // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa14286eb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa14386e8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
- ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
- ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
- "ld1b { z20.b }, p0/Z, [x24]\n"
- ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
- ".inst 0xa14086e9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
- ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
- ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
- "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n"
- ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
- ".inst 0xa14186fa // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
- ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
- ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
- "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n"
- ".inst 0xa14286eb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
- ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
- ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
- ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
- "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
+ ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+ ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+ ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
"addvl x24, x24, #4\n"
- ".inst 0xa14386e8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
- ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
- ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
- ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
- ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
- ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
- ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
- ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
- ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
- ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
- ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
- ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
- ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
- ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
- ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
- ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+ ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+ ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+ ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+ ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z20.b }, p0/Z, [x24]\n"
+ "ld1b { z22.b }, p0/Z, [x24]\n"
"subs x21, x21, #0x1\n"
"addvl x24, x24, #1\n"
- ".inst 0xa14086e1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa14086f1 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
- ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
- ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
- ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+ ".inst 0xa09502c1 // smopa za1.s, p0/M, p0/M, z22.b, z21.b\n"
+ ".inst 0xa09902c2 // smopa za2.s, p0/M, p0/M, z22.b, z25.b\n"
+ ".inst 0xa09d02c3 // smopa za3.s, p0/M, p0/M, z22.b, z29.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x13, #1, 14f\n"
@@ -222,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
- ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
- ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
"addvl x11, x11, #16\n"
- ".inst 0xa061c554 // st1w { z20.s-z23.s }, pn9.b, [x10, #0x4, MUL VL]\n"
- ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
- ".inst 0xa063c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ ".inst 0xa061c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
"addvl x10, x10, #16\n"
"blt 11b\n"
"b 20f\n"
@@ -248,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa060c554 // st1w { z20.s-z23.s }, pn9.b, [x10]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c54c // st1w { z12.s-z15.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
- ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ ".inst 0xa062c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0xc, MUL VL]\n"
"addvl x10, x10, #16\n"
"blt 13b\n"
"b 20f\n"
@@ -293,32 +290,32 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
"16:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2f0 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x23]\n"
"add x23, x23, x22\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+ ".inst 0xa160c2f1 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x23]\n"
"add x23, x23, x22\n"
"beq 17f\n"
- ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+ ".inst 0xa160c2f2 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x23]\n"
"17:" // Store to output array: Accumulator row 0 oddments: End
"18:" // Store to output array: End
"tbz x13, #0, 20f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"19:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x11, x11, #16\n"
@@ -342,4 +339,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
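
The re-encoded smopa instructions in the K loop above change only the register allocation; each one still performs the same widening outer product into a ZA tile. As a reference for reading the hunks, a scalar model of a single s8-to-s32 smopa step, assuming fully true predicates and the architectural 4-way dot-product definition (svl stands in for the vector length in 32-bit words):

    // Scalar sketch of one smopa za.s, zn.b, zm.b step on an svl x svl tile.
    #include <cstdint>
    #include <vector>

    void smopa_s8s32(std::vector<int32_t> &za,      // svl*svl accumulators, row-major
                     const std::vector<int8_t> &zn, // svl*4 bytes: 4 K-steps per row
                     const std::vector<int8_t> &zm, // svl*4 bytes: 4 K-steps per column
                     int svl)
    {
        for (int i = 0; i < svl; ++i)
            for (int j = 0; j < svl; ++j)
                for (int k = 0; k < 4; ++k) // 4-way widening dot per element
                    za[i * svl + j] += int32_t(zn[i * 4 + k]) * int32_t(zm[j * 4 + k]);
    }

Under this model the four smopa issues per loaded A vector in the 1VLx4VL kernel correspond to the four ZA tiles covering four adjacent column blocks of the output.
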
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
index a0705e50cd..f05d2cf215 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@ namespace arm_gemm
{
// Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL
{
@@ -40,7 +40,7 @@ public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
/* Kernel blocking parameters */
static unsigned int out_height()
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
- cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
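
The constructor and signature edits in these headers drop ARM_COMPUTE_UNUSED in favour of the standard C++ idiom of leaving an intentionally unused parameter unnamed; call sites and the ABI are unchanged. A compilable sketch (CPUInfo and the macro expansion are stand-ins for the real definitions):

    struct CPUInfo {};
    #define ARM_COMPUTE_UNUSED(x) (void)(x) // typical expansion of the macro

    struct cls_old
    {
        cls_old(const CPUInfo *ci) { ARM_COMPUTE_UNUSED(ci); } // pre-patch style
    };

    struct cls_new
    {
        cls_new(const CPUInfo *) {} // post-patch style: parameter left unnamed
    };

    int main()
    {
        CPUInfo info;
        cls_old a(&info); // both forms accept the same argument
        cls_new b(&info);
    }
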
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
index d7a7528211..47de894306 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
namespace arm_gemm {
-void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
{
- ARM_COMPUTE_UNUSED(act);
-
struct KernelArgs
{
KernelArgs(
@@ -96,12 +93,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"1:" // Initial accumulator load from buffer: Loop
".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
- ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa042c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa10a429c // ldnt1w { z20.s, z28.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n"
- ".inst 0xc0900381 // addha za1.s, p0/M, p0/M, z28.s\n"
+ ".inst 0xc09002a1 // addha za1.s, p0/M, p0/M, z21.s\n"
".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n"
- ".inst 0xc0900383 // addha za3.s, p0/M, p0/M, z28.s\n"
+ ".inst 0xc09002a3 // addha za3.s, p0/M, p0/M, z21.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1410770 // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa14106eb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0420768 // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04206f3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306fd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
- ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
- ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
- ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
- ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
- ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
- ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
- ".inst 0xa1410770 // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
- ".inst 0xa14106eb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
- ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
- ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
- ".inst 0xa0420768 // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa04206f3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
- ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
- ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
- ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306fd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
- ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
- ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
- ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
- ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
- ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
- ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
- ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
- ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
- ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
- ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
- ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
- ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
- ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
- ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
- ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1400774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
- ".inst 0xa14006f1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa14006e7 // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
- ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
- ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
- ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
+ ".inst 0xa0870280 // smopa za0.s, p0/M, p0/M, z20.b, z7.b\n"
+ ".inst 0xa08f0281 // smopa za1.s, p0/M, p0/M, z20.b, z15.b\n"
+ ".inst 0xa0870382 // smopa za2.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa08f0383 // smopa za3.s, p0/M, p0/M, z28.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -223,24 +220,24 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
- ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14]\n"
+ ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 23f\n"
@@ -248,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
- ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 23f\n"
@@ -275,32 +272,32 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
- ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x23\n"
- ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
- ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"17:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -328,30 +325,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
"19:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
- ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
"20:" // Store to output array: Accumulator row 1 oddments: End
"21:" // Store to output array: End
"tbz x16, #0, 23f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"22:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -375,4 +372,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
index be1106da13..ce10ab30e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@ namespace arm_gemm
{
// Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
{
@@ -40,7 +40,7 @@ public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
/* Kernel blocking parameters */
static unsigned int out_height()
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
- cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
index d863b6c72a..a23c44b7da 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
namespace arm_gemm {
-void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
{
- ARM_COMPUTE_UNUSED(act);
-
struct KernelArgs
{
KernelArgs(
@@ -94,14 +91,14 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "ldnt1w { z17.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902620 // addha za0.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902622 // addha za2.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902623 // addha za3.s, p1/M, p1/M, z17.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- "ldnt1b { z7.b }, p1/Z, [x23]\n"
- ".inst 0xa041837c // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa0428360 // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
- ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
- ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
- ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
- "ldnt1b { z7.b }, p1/Z, [x23]\n"
- ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
- ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
- ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
- ".inst 0xa041837c // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
- "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
- ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
- ".inst 0xa0428360 // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
- ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
- ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
- ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
- ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
- ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
- ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
- ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
- ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
- ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
- ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
- ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
- ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
- ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
- ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
- ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
- ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0408368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1b { z7.b }, p1/Z, [x23]\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
- ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
- ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
- ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ ".inst 0xa08f2500 // smopa za0.s, p1/M, p1/M, z8.b, z15.b\n"
+ ".inst 0xa08f2521 // smopa za1.s, p1/M, p1/M, z9.b, z15.b\n"
+ ".inst 0xa08f2542 // smopa za2.s, p1/M, p1/M, z10.b, z15.b\n"
+ ".inst 0xa08f2563 // smopa za3.s, p1/M, p1/M, z11.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
"tbz x16, #1, 14f\n"
@@ -222,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 29f\n"
@@ -248,12 +245,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
@@ -275,30 +272,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"and x20, x22, #0x3\n"
"cbz x21, 16f\n"
"15:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- "st1w { z28.s }, p0, [x26]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z29.s }, p0, [x26]\n"
+ "st1w { z9.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- "st1w { z30.s }, p0, [x26]\n"
+ "st1w { z10.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z31.s }, p0, [x26]\n"
+ "st1w { z11.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 15b\n"
"16:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 17f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- "st1w { z8.s }, p0, [x26]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
"subs x20, x20, #0x1\n"
- "st1w { z9.s }, p0, [x26]\n"
+ "st1w { z5.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 17f\n"
- "st1w { z10.s }, p0, [x26]\n"
+ "st1w { z6.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"17:" // Store to output array: Accumulator row 0 oddments: End
"subs x25, x25, x22\n"
@@ -310,30 +307,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"and x20, x22, #0x3\n"
"cbz x21, 19f\n"
"18:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- "st1w { z0.s }, p0, [x26]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z1.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- "st1w { z2.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z3.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 18b\n"
"19:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 20f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ "st1w { z20.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z21.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 20f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"20:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -345,30 +342,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"and x20, x22, #0x3\n"
"cbz x21, 22f\n"
"21:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z19.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 21b\n"
"22:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 23f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
- "st1w { z0.s }, p0, [x26]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 23f\n"
"subs x20, x20, #0x1\n"
- "st1w { z1.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 23f\n"
- "st1w { z2.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"23:" // Store to output array: Accumulator row 2 oddments: End
"subs x25, x25, x22\n"
@@ -380,44 +377,44 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
"and x20, x20, #0x3\n"
"cbz x21, 25f\n"
"24:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- "st1w { z12.s }, p0, [x26]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
"add x26, x26, x23\n"
- "st1w { z13.s }, p0, [x26]\n"
+ "st1w { z17.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"add x12, x12, #0x4\n"
- "st1w { z14.s }, p0, [x26]\n"
+ "st1w { z18.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"cmp x12, x21, LSL #2\n"
- "st1w { z15.s }, p0, [x26]\n"
+ "st1w { z19.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"blt 24b\n"
"25:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 26f\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- "st1w { z16.s }, p0, [x26]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 26f\n"
"subs x20, x20, #0x1\n"
- "st1w { z17.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26]\n"
"add x26, x26, x23\n"
"beq 26f\n"
- "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z14.s }, p0, [x26]\n"
"26:" // Store to output array: Accumulator row 3 oddments: End
"27:" // Store to output array: End
"tbz x16, #0, 29f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"28:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -441,4 +438,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
index c7bd38d905..fb84883913 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
index d868ed2b67..96247d2db5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa042c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x13, x13, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa01cc299 // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n"
- ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
- ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
- ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
- ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x28\n"
"mov x21, x9\n"
@@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- "ld1b { z10.b }, p1/Z, [x25]\n"
- ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
- "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
- ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
- ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
- ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
- "ld1b { z10.b }, p1/Z, [x25]\n"
- ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
- ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
- ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
- ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
- "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
- ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
- ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
- ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
- ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
- "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
- ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
- ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
- ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
- ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
- ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
- "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
"addvl x25, x25, #4\n"
- ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
"addvl x23, x23, #16\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
- ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
- ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
- ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
- ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
- ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
- ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
- ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
- ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
- ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
- ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
- ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
- ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
- ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- "ld1b { z10.b }, p1/Z, [x25]\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
"subs x21, x21, #0x1\n"
"addvl x25, x25, #1\n"
- ".inst 0xa04086fc // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #4\n"
- ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
- ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
- ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
- ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa1a42600 // umopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+ ".inst 0xa1a52601 // umopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa1a62602 // umopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+ ".inst 0xa1a72603 // umopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- "ld1w { z14.s }, p1/Z, [x25]\n"
+ "ld1w { z15.s }, p1/Z, [x25]\n"
"addvl x25, x25, #1\n"
- ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
- ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x14, #1, 14f\n"
"tbz x14, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5b8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
"addvl x13, x13, #16\n"
- ".inst 0xa061c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n"
- ".inst 0xa062c578 // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n"
- ".inst 0xa063c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
"addvl x11, x11, #16\n"
"blt 11b\n"
"b 21f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
- ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa061c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n"
- ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
"addvl x11, x11, #16\n"
"blt 13b\n"
"b 21f\n"
@@ -277,17 +276,17 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"ldr x24, [%x[args], %[offsetof_C]]\n"
"add x24, x24, x28\n" // C += n
"sub x23, x10, x9\n"
- "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x22, [%x[args], %[offsetof_ldcb]]\n"
"madd x24, x9, x22, x24\n" // C += m * ldc
- "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x14, #2, 15f\n"
@@ -295,10 +294,10 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"add x21, x21, x28\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x20\n"
"whilelt p0.b, x28, x27\n"
@@ -311,22 +310,22 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"16:" // Store to output array: Accumulator row 0 loop
".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
- ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
"add x12, x12, #0x2\n"
"cmp x12, x21, LSL #1\n"
- ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
- ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
- ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
- ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
- ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
- ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
- ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
- ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
- ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
"uzp1 z19.b, z26.b, z28.b\n"
@@ -344,29 +343,29 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
- ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
- ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
- ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
- ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
- ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
- ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
- ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
- ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
- ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
- ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
- ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
- ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
- ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
- ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
- ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
- "uzp1 z23.b, z2.b, z24.b\n"
- ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
- "uzp1 z16.b, z16.b, z10.b\n"
- "uzp1 z16.b, z23.b, z16.b\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
+ "uzp1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p0, [x24]\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
"19:" // Store to output array: End
@@ -374,14 +373,14 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"20:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
- ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa041c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
- ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x13, x13, #16\n"
@@ -405,4 +404,3 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
index 123405bd17..f8c375f9f5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
index cb0e9521e3..9a59799529 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
- ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
- ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
- ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -152,75 +151,75 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
- ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
- ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
- ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
- ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
- ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
- ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
- ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
- ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
- ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
- ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
- ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
- ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
- ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
"addvl x27, x27, #8\n"
- ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
"addvl x23, x23, #8\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
- ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
- ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
- ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
- ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
- ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
- ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
- ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
- ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
- ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
- ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
- ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
- ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
- ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
- ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #2\n"
".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
"addvl x23, x23, #2\n"
- ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
- ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
- ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
- ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa1b02660 // umopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa1b12661 // umopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa1b02762 // umopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa1b12763 // umopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
"bgt 9b\n"
"10:" // K oddments: End
".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
@@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n"
- ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
- ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
- ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 24f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
- ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
- ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
- ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 24f\n"
@@ -277,13 +276,13 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"ldr x26, [%x[args], %[offsetof_C]]\n"
"add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
"ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
@@ -291,10 +290,10 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"add x21, x21, x10\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.h, x10, x9\n"
@@ -305,26 +304,26 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
- ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"add x12, x12, #0x4\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
- ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
- ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
- ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z12.h, z28.h\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z13.h, z29.h\n"
- "uzp1 z17.h, z14.h, z30.h\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z15.h, z31.h\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -332,27 +331,27 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
- ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
- ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
- ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
- ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
- ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
- ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z28.h, z12.h\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z29.h, z13.h\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
- "uzp1 z16.h, z30.h, z14.h\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"18:" // Store to output array: Accumulator row 0 oddments: End
@@ -367,25 +366,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
- ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
- ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
"add x12, x12, #0x4\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
- ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z4.h, z16.h\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z5.h, z17.h\n"
- "uzp1 z17.h, z6.h, z18.h\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
- "uzp1 z16.h, z7.h, z19.h\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
"st1b { z17.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"st1b { z16.h }, p0, [x26]\n"
@@ -393,27 +392,27 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
- ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
- ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
- ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
- ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
- ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
- ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
- "uzp1 z16.h, z20.h, z16.h\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- "uzp1 z16.h, z21.h, z17.h\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
"st1b { z16.h }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
- "uzp1 z16.h, z22.h, z18.h\n"
+ "uzp1 z16.h, z6.h, z18.h\n"
"st1b { z16.h }, p0, [x26]\n"
"21:" // Store to output array: Accumulator row 1 oddments: End
"22:" // Store to output array: End
@@ -452,4 +451,3 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
index 2e61cf49a8..04d19324c5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
#include <cstdint>
#include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@ public:
StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
- cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *ci)
+ cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *)
{
- ARM_COMPUTE_UNUSED(ci);
}
};
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
index 8f8886b876..0f3346e65e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
#ifdef ARM_COMPUTE_ENABLE_SME2
#include "arm_gemm.hpp"
@@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"1:" // Initial accumulator load from buffer: Loop
- ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
- ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
- ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
- ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"ldr x20, [%x[args], %[offsetof_bias]]\n"
".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
"cbz x20, 5f\n"
- "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
- ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
- ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
"4:" // Prepare accumulators: Test for last block
"mov x20, x10\n"
"mov x21, x11\n"
@@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
"cbz x22, 8f\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- "ldnt1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"ble 7f\n"
"6:" // K loop
- ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
"subs x22, x22, #0x1\n"
- ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
- ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
- ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
- "ldnt1b { z0.b }, p1/Z, [x23]\n"
- ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
- ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
- ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
- ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
- ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
- "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
- ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
- ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
- ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
- ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
- "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
- ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
- ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
- ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
- ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
"addvl x27, x27, #16\n"
- "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
"addvl x23, x23, #4\n"
"bgt 6b\n"
"7:" // K loop tail
- ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
- ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
- ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
- ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
- ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
- ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
- ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
- ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
- ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
- ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
- ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
- ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
- ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
- ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
"8:" // K oddments
"cbz x21, 10f\n"
"9:" // K oddments: Loop
".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
"subs x21, x21, #0x1\n"
"addvl x27, x27, #4\n"
- "ld1b { z0.b }, p1/Z, [x23]\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
"addvl x23, x23, #1\n"
- ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
- ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
- ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
- ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1af2640 // umopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+ ".inst 0xa1af26c1 // umopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+ ".inst 0xa1af2742 // umopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+ ".inst 0xa1af27c3 // umopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
"bgt 9b\n"
"10:" // K oddments: End
- ".inst 0xa040c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n"
+ ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
"addvl x27, x27, #4\n"
- ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
- ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
- ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
- ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
"tbz x16, #1, 14f\n"
"tbz x16, #0, 12f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"11:" // Store to partial result buffer: Store and refill: Loop
- ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
- ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
- ".inst 0xa041c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
- ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
- ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
"addvl x15, x15, #16\n"
- ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
- ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 11b\n"
"b 30f\n"
@@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"mov x12, #0x0\n"
"cntw x20\n"
"13:" // Store to partial result buffer: Store only: Loop
- ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
- ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
- ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
- ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
- ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
- ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
"addvl x14, x14, #16\n"
"blt 13b\n"
"b 30f\n"
@@ -277,22 +276,22 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"ldr x26, [%x[args], %[offsetof_C]]\n"
"add x26, x26, x10\n" // C += n
"sub x25, x13, x11\n"
- "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
"ldr x24, [%x[args], %[offsetof_ldcb]]\n"
"madd x26, x11, x24, x26\n" // C += m * ldc
- "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
"tbz x16, #2, 15f\n"
"ldr w21, [%x[args], %[offsetof_n_0]]\n"
"add x21, x21, x10\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
"add x20, x20, x21, LSL #2\n"
- "ld1w { z8.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
"ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"add x20, x20, x21, LSL #2\n"
- "ld1w { z7.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
"15:" // Store to output array: Load per-channel parameters: End
"cntw x23\n"
"whilelt p0.s, x10, x9\n"
@@ -303,30 +302,30 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"and x20, x22, #0x3\n"
"cbz x21, 17f\n"
"16:" // Store to output array: Accumulator row 0 loop
- ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
- ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
- "st1b { z12.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z13.s }, p0, [x26]\n"
+ "st1b { z17.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z14.s }, p0, [x26]\n"
+ "st1b { z18.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z15.s }, p0, [x26]\n"
+ "st1b { z19.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 16b\n"
"17:" // Store to output array: Accumulator row 0 oddments
"cbz x20, 18f\n"
".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
- ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
- ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
"st1b { z16.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 18f\n"
@@ -347,38 +346,38 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"and x20, x22, #0x3\n"
"cbz x21, 20f\n"
"19:" // Store to output array: Accumulator row 1 loop
- ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
- "st1b { z16.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z17.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z18.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z19.s }, p0, [x26]\n"
+ "st1b { z7.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 19b\n"
"20:" // Store to output array: Accumulator row 1 oddments
"cbz x20, 21f\n"
- ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
- ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
- ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
- "st1b { z28.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
"subs x20, x20, #0x1\n"
- "st1b { z29.s }, p0, [x26]\n"
+ "st1b { z5.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 21f\n"
- "st1b { z30.s }, p0, [x26]\n"
+ "st1b { z6.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"21:" // Store to output array: Accumulator row 1 oddments: End
"subs x25, x25, x22\n"
@@ -391,30 +390,30 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"and x20, x22, #0x3\n"
"cbz x21, 23f\n"
"22:" // Store to output array: Accumulator row 2 loop
- ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
- ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
- ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
- "st1b { z24.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z25.s }, p0, [x26]\n"
+ "st1b { z9.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z26.s }, p0, [x26]\n"
+ "st1b { z10.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z27.s }, p0, [x26]\n"
+ "st1b { z11.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 22b\n"
"23:" // Store to output array: Accumulator row 2 oddments
"cbz x20, 24f\n"
".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
- ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
- ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
"st1b { z12.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 24f\n"
@@ -435,52 +434,52 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
"and x20, x20, #0x3\n"
"cbz x21, 26f\n"
"25:" // Store to output array: Accumulator row 3 loop
- ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"add x12, x12, #0x4\n"
- ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
"cmp x12, x21, LSL #2\n"
- ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
- ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
- "st1b { z20.s }, p0, [x26]\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z21.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z22.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z23.s }, p0, [x26]\n"
+ "st1b { z31.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"blt 25b\n"
"26:" // Store to output array: Accumulator row 3 oddments
"cbz x20, 27f\n"
- ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
- ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
"subs x20, x20, #0x1\n"
- ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
- ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
- "st1b { z0.s }, p0, [x26]\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
"subs x20, x20, #0x1\n"
- "st1b { z1.s }, p0, [x26]\n"
+ "st1b { z29.s }, p0, [x26]\n"
"add x26, x26, x24\n"
"beq 27f\n"
- "st1b { z2.s }, p0, [x26]\n"
+ "st1b { z30.s }, p0, [x26]\n"
"27:" // Store to output array: Accumulator row 3 oddments: End
"28:" // Store to output array: End
"tbz x16, #0, 30f\n"
"mov x12, #0x0\n"
"cntw x20\n"
"29:" // Store to output array: Refill accumulators: Loop
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
- ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
- ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
"add x12, x12, #0x4\n"
"cmp x12, x20\n"
"addvl x15, x15, #16\n"
@@ -504,4 +503,3 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
index e07fa549f3..1ce169d562 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 13f2e488dd..9136e32567 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -157,16 +157,16 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 6f\n"
"4:" // Height 1: no bias
"tbz %x[flags], #0, 5f\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x13]\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 6f\n"
@@ -184,11 +184,11 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -200,43 +200,43 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
"add x26, x26, #0x10\n"
"addvl x12, x12, #4\n"
"addvl x11, x11, #4\n"
@@ -246,46 +246,46 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
"ble 12f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
"addvl x10, x10, #2\n"
@@ -301,17 +301,17 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 13f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"13:" // Height 1: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -376,21 +376,21 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 20f\n"
@@ -408,12 +408,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -421,50 +421,50 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 23f\n"
"22:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"23:" // Height 2: input setup done
"cmp x27, #0x8\n"
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"addvl x12, x12, #4\n"
@@ -475,47 +475,47 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
"ble 26f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
"addvl x10, x10, #2\n"
@@ -537,25 +537,25 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp2 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z7.s, p5/M, z7.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"27:" // Height 2: No activation
"st1w { z7.s }, p4, [x13]\n"
"st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -632,28 +632,28 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -685,13 +685,13 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -700,145 +700,145 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 37f\n"
"36:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"37:" // Height 3: input setup done
"cmp x27, #0x8\n"
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"cmp x27, #0x8\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
"addvl x12, x12, #4\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 40f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
"40:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -861,33 +861,33 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 41f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z7.s, p5/M, z7.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
"41:" // Height 3: No activation
"st1w { z7.s }, p4, [x13]\n"
"st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -968,37 +968,37 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -1026,14 +1026,14 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1043,149 +1043,149 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 51f\n"
"50:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"51:" // Height 4: input setup done
"cmp x27, #0x8\n"
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"cmp x27, #0x8\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"addvl x11, x11, #4\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 54f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
"54:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1213,41 +1213,41 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 55f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
"55:" // Height 4: No activation
"st1w { z7.s }, p4, [x13]\n"
"st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -1340,54 +1340,54 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x13, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 62f\n"
"61:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1419,15 +1419,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 65f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1438,189 +1438,189 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 65f\n"
"64:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"65:" // Height 5: input setup done
"cmp x27, #0x8\n"
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "ld1rqh { z6.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
+ ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
+ ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
"addvl x12, x12, #4\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
"addvl x11, x11, #4\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
"addvl x10, x10, #4\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
"addvl x9, x9, #4\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"ble 68f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"68:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1653,49 +1653,49 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 69f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
"69:" // Height 5: No activation
"st1w { z7.s }, p4, [x13]\n"
"st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -1795,59 +1795,59 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x13]\n"
+ "add x24, x13, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x13]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
"zip2 z15.d, z16.d, z15.d\n"
"zip1 z16.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
"zip1 z18.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 76f\n"
"75:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1879,16 +1879,16 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 79f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1900,193 +1900,193 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL (
"b 79f\n"
"78:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"79:" // Height 6: input setup done
"cmp x27, #0x8\n"
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "ld1rqh { z6.h }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
"add x21, x21, #0x10\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"addvl x12, x12, #4\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
"addvl x11, x11, #4\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
"addvl x10, x10, #4\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
"addvl x9, x9, #4\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "ld1rqh { z6.h }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"ble 82f\n"
- "ld1h { z7.h }, p5/Z, [x12]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
"addvl x9, x9, #2\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"82:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
index acbc619eed..c42ad7e879 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
index 5f093bf08a..66601bd312 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -163,11 +163,11 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -183,12 +183,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"10:" // Height 1: Multiply loop: Main loop
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -201,12 +201,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
@@ -214,17 +214,17 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"bne 7b\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
"12:" // Height 1: No activation
"st1h { z8.h }, p3, [x13]\n"
"st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -285,15 +285,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"17:" // Height 2: no bias
"tbz %x[flags], #0, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
+ "add x20, x13, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x20]\n"
+ "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 19f\n"
"18:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -309,12 +309,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"20:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 22f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -322,7 +322,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 22f\n"
"21:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"22:" // Height 2: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -333,19 +333,19 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"23:" // Height 2: Multiply loop: Main loop
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
"addvl x12, x12, #1\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
"addvl x11, x11, #1\n"
"add x26, x26, #0x2\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
"add x25, x25, #0x2\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
"addvl x10, x10, #1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
@@ -357,18 +357,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"bne 20b\n"
@@ -376,25 +376,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmin z12.h, p4/M, z12.h, z17.h\n"
+ "fmin z13.h, p4/M, z13.h, z17.h\n"
+ "fmin z14.h, p4/M, z14.h, z17.h\n"
+ "fmin z15.h, p4/M, z15.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "fmax z12.h, p4/M, z12.h, z16.h\n"
+ "fmax z13.h, p4/M, z13.h, z16.h\n"
+ "fmax z14.h, p4/M, z14.h, z16.h\n"
+ "fmax z15.h, p4/M, z15.h, z16.h\n"
"25:" // Height 2: No activation
"st1h { z8.h }, p3, [x13]\n"
"st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -463,20 +463,20 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"30:" // Height 3: no bias
"tbz %x[flags], #0, 31f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x21]\n"
+ "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x20]\n"
+ "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 32f\n"
"31:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -496,13 +496,13 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"33:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 35f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -511,8 +511,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 35f\n"
"34:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"35:" // Height 3: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -528,22 +528,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
"add x26, x26, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z20.h }, p4/Z, [x9]\n"
"subs x27, x27, #0x1\n"
"add x25, x25, #0x2\n"
"add x24, x24, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
@@ -557,54 +557,54 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
"cmp x28, x20\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z20.h }, p4/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
"bne 33b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "ld1rh { z20.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z21.h\n"
+ "fmin z9.h, p4/M, z9.h, z21.h\n"
+ "fmin z10.h, p4/M, z10.h, z21.h\n"
+ "fmin z11.h, p4/M, z11.h, z21.h\n"
+ "fmin z12.h, p4/M, z12.h, z21.h\n"
+ "fmin z13.h, p4/M, z13.h, z21.h\n"
+ "fmin z14.h, p4/M, z14.h, z21.h\n"
+ "fmin z15.h, p4/M, z15.h, z21.h\n"
+ "fmin z16.h, p4/M, z16.h, z21.h\n"
+ "fmin z17.h, p4/M, z17.h, z21.h\n"
+ "fmin z18.h, p4/M, z18.h, z21.h\n"
+ "fmin z19.h, p4/M, z19.h, z21.h\n"
+ "fmax z8.h, p4/M, z8.h, z20.h\n"
+ "fmax z9.h, p4/M, z9.h, z20.h\n"
+ "fmax z10.h, p4/M, z10.h, z20.h\n"
+ "fmax z11.h, p4/M, z11.h, z20.h\n"
+ "fmax z12.h, p4/M, z12.h, z20.h\n"
+ "fmax z13.h, p4/M, z13.h, z20.h\n"
+ "fmax z14.h, p4/M, z14.h, z20.h\n"
+ "fmax z15.h, p4/M, z15.h, z20.h\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
+ "fmax z17.h, p4/M, z17.h, z20.h\n"
+ "fmax z18.h, p4/M, z18.h, z20.h\n"
+ "fmax z19.h, p4/M, z19.h, z20.h\n"
"38:" // Height 3: No activation
"st1h { z8.h }, p3, [x13]\n"
"st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -681,25 +681,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"43:" // Height 4: no bias
"tbz %x[flags], #0, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x13]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x22]\n"
+ "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x21]\n"
+ "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x20]\n"
+ "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 45f\n"
"44:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -723,14 +723,14 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -740,9 +740,9 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"48:" // Height 4: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -759,7 +759,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z25.h }, p4/Z, [x10]\n"
"add x26, x26, #0x2\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -767,22 +767,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x25, x25, #0x2\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z24.h }, p4/Z, [x9]\n"
"add x24, x24, #0x2\n"
"add x23, x23, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
"addvl x9, x9, #1\n"
"ld1h { z6.h }, p4/Z, [x12]\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
@@ -794,7 +794,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z25.h }, p4/Z, [x10]\n"
"cmp x28, x20\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -802,17 +802,17 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z24.h }, p4/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #1\n"
@@ -820,41 +820,41 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmin z20.h, p4/M, z20.h, z1.h\n"
- "fmin z21.h, p4/M, z21.h, z1.h\n"
- "fmin z22.h, p4/M, z22.h, z1.h\n"
- "fmin z23.h, p4/M, z23.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
- "fmax z20.h, p4/M, z20.h, z0.h\n"
- "fmax z21.h, p4/M, z21.h, z0.h\n"
- "fmax z22.h, p4/M, z22.h, z0.h\n"
- "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "ld1rh { z24.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z25.h\n"
+ "fmin z9.h, p4/M, z9.h, z25.h\n"
+ "fmin z10.h, p4/M, z10.h, z25.h\n"
+ "fmin z11.h, p4/M, z11.h, z25.h\n"
+ "fmin z12.h, p4/M, z12.h, z25.h\n"
+ "fmin z13.h, p4/M, z13.h, z25.h\n"
+ "fmin z14.h, p4/M, z14.h, z25.h\n"
+ "fmin z15.h, p4/M, z15.h, z25.h\n"
+ "fmin z16.h, p4/M, z16.h, z25.h\n"
+ "fmin z17.h, p4/M, z17.h, z25.h\n"
+ "fmin z18.h, p4/M, z18.h, z25.h\n"
+ "fmin z19.h, p4/M, z19.h, z25.h\n"
+ "fmin z20.h, p4/M, z20.h, z25.h\n"
+ "fmin z21.h, p4/M, z21.h, z25.h\n"
+ "fmin z22.h, p4/M, z22.h, z25.h\n"
+ "fmin z23.h, p4/M, z23.h, z25.h\n"
+ "fmax z8.h, p4/M, z8.h, z24.h\n"
+ "fmax z9.h, p4/M, z9.h, z24.h\n"
+ "fmax z10.h, p4/M, z10.h, z24.h\n"
+ "fmax z11.h, p4/M, z11.h, z24.h\n"
+ "fmax z12.h, p4/M, z12.h, z24.h\n"
+ "fmax z13.h, p4/M, z13.h, z24.h\n"
+ "fmax z14.h, p4/M, z14.h, z24.h\n"
+ "fmax z15.h, p4/M, z15.h, z24.h\n"
+ "fmax z16.h, p4/M, z16.h, z24.h\n"
+ "fmax z17.h, p4/M, z17.h, z24.h\n"
+ "fmax z18.h, p4/M, z18.h, z24.h\n"
+ "fmax z19.h, p4/M, z19.h, z24.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "fmax z21.h, p4/M, z21.h, z24.h\n"
+ "fmax z22.h, p4/M, z22.h, z24.h\n"
+ "fmax z23.h, p4/M, z23.h, z24.h\n"
"51:" // Height 4: No activation
"st1h { z8.h }, p3, [x13]\n"
"st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -939,30 +939,30 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"56:" // Height 5: no bias
"tbz %x[flags], #0, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p3/Z, [x13]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x23, x13, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p3/Z, [x22]\n"
- "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x23]\n"
+ "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x22]\n"
+ "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x21]\n"
+ "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x20]\n"
+ "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 58f\n"
"57:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -990,15 +990,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"59:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 60f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1009,10 +1009,10 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 61f\n"
"60:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"61:" // Height 5: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1034,7 +1034,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
"add x25, x25, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -1042,24 +1042,24 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x23, x23, #0x2\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z28.h }, p4/Z, [x9]\n"
"add x22, x22, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z26.h, p4/M, z6.h, z4.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x12]\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
- "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
"ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1rh { z4.h }, p4/Z, [x22]\n"
"ld1h { z7.h }, p4/Z, [x11]\n"
@@ -1075,25 +1075,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"addvl x12, x12, #1\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
"addvl x11, x11, #1\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"addvl x10, x10, #1\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
- "ld1h { z7.h }, p4/Z, [x9]\n"
+ "ld1h { z28.h }, p4/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z26.h, p4/M, z6.h, z4.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
- "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
"bne 59b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #1\n"
@@ -1102,49 +1102,49 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmin z20.h, p4/M, z20.h, z1.h\n"
- "fmin z21.h, p4/M, z21.h, z1.h\n"
- "fmin z22.h, p4/M, z22.h, z1.h\n"
- "fmin z23.h, p4/M, z23.h, z1.h\n"
- "fmin z24.h, p4/M, z24.h, z1.h\n"
- "fmin z25.h, p4/M, z25.h, z1.h\n"
- "fmin z26.h, p4/M, z26.h, z1.h\n"
- "fmin z27.h, p4/M, z27.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
- "fmax z20.h, p4/M, z20.h, z0.h\n"
- "fmax z21.h, p4/M, z21.h, z0.h\n"
- "fmax z22.h, p4/M, z22.h, z0.h\n"
- "fmax z23.h, p4/M, z23.h, z0.h\n"
- "fmax z24.h, p4/M, z24.h, z0.h\n"
- "fmax z25.h, p4/M, z25.h, z0.h\n"
- "fmax z26.h, p4/M, z26.h, z0.h\n"
- "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "ld1rh { z28.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z29.h\n"
+ "fmin z9.h, p4/M, z9.h, z29.h\n"
+ "fmin z10.h, p4/M, z10.h, z29.h\n"
+ "fmin z11.h, p4/M, z11.h, z29.h\n"
+ "fmin z12.h, p4/M, z12.h, z29.h\n"
+ "fmin z13.h, p4/M, z13.h, z29.h\n"
+ "fmin z14.h, p4/M, z14.h, z29.h\n"
+ "fmin z15.h, p4/M, z15.h, z29.h\n"
+ "fmin z16.h, p4/M, z16.h, z29.h\n"
+ "fmin z17.h, p4/M, z17.h, z29.h\n"
+ "fmin z18.h, p4/M, z18.h, z29.h\n"
+ "fmin z19.h, p4/M, z19.h, z29.h\n"
+ "fmin z20.h, p4/M, z20.h, z29.h\n"
+ "fmin z21.h, p4/M, z21.h, z29.h\n"
+ "fmin z22.h, p4/M, z22.h, z29.h\n"
+ "fmin z23.h, p4/M, z23.h, z29.h\n"
+ "fmin z24.h, p4/M, z24.h, z29.h\n"
+ "fmin z25.h, p4/M, z25.h, z29.h\n"
+ "fmin z26.h, p4/M, z26.h, z29.h\n"
+ "fmin z27.h, p4/M, z27.h, z29.h\n"
+ "fmax z8.h, p4/M, z8.h, z28.h\n"
+ "fmax z9.h, p4/M, z9.h, z28.h\n"
+ "fmax z10.h, p4/M, z10.h, z28.h\n"
+ "fmax z11.h, p4/M, z11.h, z28.h\n"
+ "fmax z12.h, p4/M, z12.h, z28.h\n"
+ "fmax z13.h, p4/M, z13.h, z28.h\n"
+ "fmax z14.h, p4/M, z14.h, z28.h\n"
+ "fmax z15.h, p4/M, z15.h, z28.h\n"
+ "fmax z16.h, p4/M, z16.h, z28.h\n"
+ "fmax z17.h, p4/M, z17.h, z28.h\n"
+ "fmax z18.h, p4/M, z18.h, z28.h\n"
+ "fmax z19.h, p4/M, z19.h, z28.h\n"
+ "fmax z20.h, p4/M, z20.h, z28.h\n"
+ "fmax z21.h, p4/M, z21.h, z28.h\n"
+ "fmax z22.h, p4/M, z22.h, z28.h\n"
+ "fmax z23.h, p4/M, z23.h, z28.h\n"
+ "fmax z24.h, p4/M, z24.h, z28.h\n"
+ "fmax z25.h, p4/M, z25.h, z28.h\n"
+ "fmax z26.h, p4/M, z26.h, z28.h\n"
+ "fmax z27.h, p4/M, z27.h, z28.h\n"
"64:" // Height 5: No activation
"st1h { z8.h }, p3, [x13]\n"
"st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -1240,35 +1240,35 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"69:" // Height 6: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x24, x13, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p3/Z, [x22]\n"
- "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1h { z28.h }, p3/Z, [x21]\n"
- "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p3/Z, [x20]\n"
+ "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 71f\n"
"70:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1300,16 +1300,16 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"72:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 74f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1321,11 +1321,11 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
"b 74f\n"
"73:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"74:" // Height 6: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1527,4 +1527,4 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
index 0b543b667f..842db1a4fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
@@ -163,11 +163,11 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -180,72 +180,72 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
"sub x27, x27, #0x8\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
"cmp x27, #0x8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
"add x26, x26, #0x10\n"
"addvl x12, x12, #8\n"
"addvl x11, x11, #8\n"
@@ -255,112 +255,112 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
@@ -372,17 +372,17 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"bne 7b\n"
"tbz %x[flags], #1, 13f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
"13:" // Height 1: No activation
"st1h { z8.h }, p4, [x13]\n"
"st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -443,15 +443,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
+ "add x20, x13, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x20]\n"
+ "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 20f\n"
"19:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -467,12 +467,12 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -480,263 +480,263 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 23f\n"
"22:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"23:" // Height 2: input setup done
"cmp x27, #0x8\n"
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z1.h[0]\n"
+ "fmla z12.h, z17.h, z0.h[0]\n"
+ "fmla z9.h, z16.h, z1.h[0]\n"
+ "fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z1.h[0]\n"
+ "fmla z14.h, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
"cmp x27, #0x8\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[0]\n"
+ "fmla z15.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
"add x26, x26, #0x10\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[1]\n"
+ "fmla z12.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[1]\n"
+ "fmla z13.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[1]\n"
+ "fmla z14.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[1]\n"
+ "fmla z15.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[2]\n"
+ "fmla z12.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[2]\n"
+ "fmla z13.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[2]\n"
+ "fmla z14.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[2]\n"
+ "fmla z15.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[3]\n"
+ "fmla z12.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[3]\n"
+ "fmla z13.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[3]\n"
+ "fmla z14.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[3]\n"
+ "fmla z15.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[4]\n"
+ "fmla z12.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[4]\n"
+ "fmla z13.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[4]\n"
+ "fmla z14.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[4]\n"
+ "fmla z15.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[5]\n"
+ "fmla z12.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[5]\n"
+ "fmla z13.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[5]\n"
+ "fmla z14.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[5]\n"
+ "fmla z15.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[6]\n"
+ "fmla z12.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[6]\n"
+ "fmla z13.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[6]\n"
+ "fmla z14.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[6]\n"
+ "fmla z15.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #8\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[7]\n"
+ "fmla z12.h, z17.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[7]\n"
+ "fmla z13.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z10.h, z17.h, z1.h[7]\n"
+ "fmla z14.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z1.h[7]\n"
+ "fmla z15.h, z16.h, z0.h[7]\n"
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[0]\n"
+ "fmla z12.h, z17.h, z1.h[0]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z14.h, z17.h, z1.h[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "fmla z15.h, z16.h, z1.h[0]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z12.h, z17.h, z1.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z14.h, z17.h, z1.h[1]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "fmla z15.h, z16.h, z1.h[1]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z12.h, z17.h, z1.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z14.h, z17.h, z1.h[2]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "fmla z15.h, z16.h, z1.h[2]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z12.h, z17.h, z1.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z14.h, z17.h, z1.h[3]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "fmla z15.h, z16.h, z1.h[3]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z12.h, z17.h, z1.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z14.h, z17.h, z1.h[4]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "fmla z15.h, z16.h, z1.h[4]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z12.h, z17.h, z1.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z14.h, z17.h, z1.h[5]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "fmla z15.h, z16.h, z1.h[5]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z12.h, z17.h, z1.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z14.h, z17.h, z1.h[6]\n"
"addvl x12, x12, #1\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "fmla z15.h, z16.h, z1.h[6]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z12.h, z17.h, z1.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z14.h, z17.h, z1.h[7]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "fmla z15.h, z16.h, z1.h[7]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"26:" // Height 2: Multiply loop: multiply skip
@@ -748,25 +748,25 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"add x25, x13, x20, LSL #1\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmin z12.h, p5/M, z12.h, z17.h\n"
+ "fmin z13.h, p5/M, z13.h, z17.h\n"
+ "fmin z14.h, p5/M, z14.h, z17.h\n"
+ "fmin z15.h, p5/M, z15.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
+ "fmax z12.h, p5/M, z12.h, z16.h\n"
+ "fmax z13.h, p5/M, z13.h, z16.h\n"
+ "fmax z14.h, p5/M, z14.h, z16.h\n"
+ "fmax z15.h, p5/M, z15.h, z16.h\n"
"27:" // Height 2: No activation
"st1h { z8.h }, p4, [x13]\n"
"st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -835,20 +835,20 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x21]\n"
+ "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x20]\n"
+ "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 34f\n"
"33:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -868,13 +868,13 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -883,153 +883,153 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 37f\n"
"36:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"37:" // Height 3: input setup done
"cmp x27, #0x8\n"
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z21.h, z2.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"cmp x27, #0x8\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z10.h, z21.h, z2.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z18.h, z21.h, z0.h[0]\n"
+ "fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z0.h[1]\n"
+ "fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z0.h[1]\n"
+ "fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z0.h[2]\n"
+ "fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z0.h[2]\n"
+ "fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z0.h[3]\n"
+ "fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z0.h[3]\n"
+ "fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z0.h[4]\n"
+ "fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z0.h[4]\n"
+ "fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z0.h[5]\n"
+ "fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z0.h[5]\n"
+ "fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z0.h[6]\n"
+ "fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z0.h[6]\n"
+ "fmla z11.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #8\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z0.h[7]\n"
+ "fmla z9.h, z20.h, z2.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z0.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z10.h, z21.h, z2.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z0.h[7]\n"
+ "fmla z11.h, z20.h, z2.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z0.h[7]\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1037,179 +1037,179 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z21.h, z0.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z10.h, z21.h, z0.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z18.h, z21.h, z2.h[0]\n"
+ "fmla z11.h, z20.h, z0.h[0]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z2.h[0]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z2.h[1]\n"
+ "fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z10.h, z21.h, z0.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z18.h, z21.h, z2.h[1]\n"
+ "fmla z11.h, z20.h, z0.h[1]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z2.h[1]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z2.h[2]\n"
+ "fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z10.h, z21.h, z0.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z18.h, z21.h, z2.h[2]\n"
+ "fmla z11.h, z20.h, z0.h[2]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z2.h[2]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z2.h[3]\n"
+ "fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z10.h, z21.h, z0.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z18.h, z21.h, z2.h[3]\n"
+ "fmla z11.h, z20.h, z0.h[3]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z2.h[3]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z2.h[4]\n"
+ "fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z2.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z10.h, z21.h, z0.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z18.h, z21.h, z2.h[4]\n"
+ "fmla z11.h, z20.h, z0.h[4]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z2.h[4]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z2.h[5]\n"
+ "fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z2.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z10.h, z21.h, z0.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z18.h, z21.h, z2.h[5]\n"
+ "fmla z11.h, z20.h, z0.h[5]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z2.h[5]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z2.h[6]\n"
+ "fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z2.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z10.h, z21.h, z0.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z18.h, z21.h, z2.h[6]\n"
+ "fmla z11.h, z20.h, z0.h[6]\n"
"addvl x9, x9, #1\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z2.h[6]\n"
"ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z2.h[7]\n"
+ "fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z2.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
"addvl x11, x11, #1\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z10.h, z21.h, z0.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z18.h, z21.h, z2.h[7]\n"
+ "fmla z11.h, z20.h, z0.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z2.h[7]\n"
"40:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1220,33 +1220,33 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 41f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
+ "ld1rh { z20.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z21.h\n"
+ "fmin z9.h, p5/M, z9.h, z21.h\n"
+ "fmin z10.h, p5/M, z10.h, z21.h\n"
+ "fmin z11.h, p5/M, z11.h, z21.h\n"
+ "fmin z12.h, p5/M, z12.h, z21.h\n"
+ "fmin z13.h, p5/M, z13.h, z21.h\n"
+ "fmin z14.h, p5/M, z14.h, z21.h\n"
+ "fmin z15.h, p5/M, z15.h, z21.h\n"
+ "fmin z16.h, p5/M, z16.h, z21.h\n"
+ "fmin z17.h, p5/M, z17.h, z21.h\n"
+ "fmin z18.h, p5/M, z18.h, z21.h\n"
+ "fmin z19.h, p5/M, z19.h, z21.h\n"
+ "fmax z8.h, p5/M, z8.h, z20.h\n"
+ "fmax z9.h, p5/M, z9.h, z20.h\n"
+ "fmax z10.h, p5/M, z10.h, z20.h\n"
+ "fmax z11.h, p5/M, z11.h, z20.h\n"
+ "fmax z12.h, p5/M, z12.h, z20.h\n"
+ "fmax z13.h, p5/M, z13.h, z20.h\n"
+ "fmax z14.h, p5/M, z14.h, z20.h\n"
+ "fmax z15.h, p5/M, z15.h, z20.h\n"
+ "fmax z16.h, p5/M, z16.h, z20.h\n"
+ "fmax z17.h, p5/M, z17.h, z20.h\n"
+ "fmax z18.h, p5/M, z18.h, z20.h\n"
+ "fmax z19.h, p5/M, z19.h, z20.h\n"
"41:" // Height 3: No activation
"st1h { z8.h }, p4, [x13]\n"
"st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -1323,25 +1323,25 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x22]\n"
+ "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x21]\n"
+ "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x20]\n"
+ "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 48f\n"
"47:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1365,14 +1365,14 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1382,188 +1382,188 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 51f\n"
"50:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"51:" // Height 4: input setup done
"cmp x27, #0x8\n"
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z3.h[0]\n"
+ "fmla z12.h, z25.h, z2.h[0]\n"
+ "fmla z16.h, z25.h, z1.h[0]\n"
+ "fmla z20.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z17.h, z24.h, z1.h[0]\n"
+ "fmla z21.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z25.h, z3.h[0]\n"
+ "fmla z14.h, z25.h, z2.h[0]\n"
+ "fmla z18.h, z25.h, z1.h[0]\n"
+ "fmla z22.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[0]\n"
+ "fmla z15.h, z24.h, z2.h[0]\n"
+ "fmla z19.h, z24.h, z1.h[0]\n"
+ "fmla z23.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[1]\n"
+ "fmla z12.h, z25.h, z2.h[1]\n"
+ "fmla z16.h, z25.h, z1.h[1]\n"
+ "fmla z20.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[1]\n"
+ "fmla z13.h, z24.h, z2.h[1]\n"
+ "fmla z17.h, z24.h, z1.h[1]\n"
+ "fmla z21.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[1]\n"
+ "fmla z14.h, z25.h, z2.h[1]\n"
+ "fmla z18.h, z25.h, z1.h[1]\n"
+ "fmla z22.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[1]\n"
+ "fmla z15.h, z24.h, z2.h[1]\n"
+ "fmla z19.h, z24.h, z1.h[1]\n"
+ "fmla z23.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[2]\n"
+ "fmla z12.h, z25.h, z2.h[2]\n"
+ "fmla z16.h, z25.h, z1.h[2]\n"
+ "fmla z20.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[2]\n"
+ "fmla z13.h, z24.h, z2.h[2]\n"
+ "fmla z17.h, z24.h, z1.h[2]\n"
+ "fmla z21.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[2]\n"
+ "fmla z14.h, z25.h, z2.h[2]\n"
+ "fmla z18.h, z25.h, z1.h[2]\n"
+ "fmla z22.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[2]\n"
+ "fmla z15.h, z24.h, z2.h[2]\n"
+ "fmla z19.h, z24.h, z1.h[2]\n"
+ "fmla z23.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[3]\n"
+ "fmla z12.h, z25.h, z2.h[3]\n"
+ "fmla z16.h, z25.h, z1.h[3]\n"
+ "fmla z20.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[3]\n"
+ "fmla z13.h, z24.h, z2.h[3]\n"
+ "fmla z17.h, z24.h, z1.h[3]\n"
+ "fmla z21.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[3]\n"
+ "fmla z14.h, z25.h, z2.h[3]\n"
+ "fmla z18.h, z25.h, z1.h[3]\n"
+ "fmla z22.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[3]\n"
+ "fmla z15.h, z24.h, z2.h[3]\n"
+ "fmla z19.h, z24.h, z1.h[3]\n"
+ "fmla z23.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[4]\n"
+ "fmla z12.h, z25.h, z2.h[4]\n"
+ "fmla z16.h, z25.h, z1.h[4]\n"
+ "fmla z20.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[4]\n"
+ "fmla z13.h, z24.h, z2.h[4]\n"
+ "fmla z17.h, z24.h, z1.h[4]\n"
+ "fmla z21.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[4]\n"
+ "fmla z14.h, z25.h, z2.h[4]\n"
+ "fmla z18.h, z25.h, z1.h[4]\n"
+ "fmla z22.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[4]\n"
+ "fmla z15.h, z24.h, z2.h[4]\n"
+ "fmla z19.h, z24.h, z1.h[4]\n"
+ "fmla z23.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[5]\n"
+ "fmla z12.h, z25.h, z2.h[5]\n"
+ "fmla z16.h, z25.h, z1.h[5]\n"
+ "fmla z20.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[5]\n"
+ "fmla z13.h, z24.h, z2.h[5]\n"
+ "fmla z17.h, z24.h, z1.h[5]\n"
+ "fmla z21.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[5]\n"
+ "fmla z14.h, z25.h, z2.h[5]\n"
+ "fmla z18.h, z25.h, z1.h[5]\n"
+ "fmla z22.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[5]\n"
+ "fmla z15.h, z24.h, z2.h[5]\n"
+ "fmla z19.h, z24.h, z1.h[5]\n"
+ "fmla z23.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[6]\n"
+ "fmla z12.h, z25.h, z2.h[6]\n"
+ "fmla z16.h, z25.h, z1.h[6]\n"
+ "fmla z20.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[6]\n"
+ "fmla z13.h, z24.h, z2.h[6]\n"
+ "fmla z17.h, z24.h, z1.h[6]\n"
+ "fmla z21.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[6]\n"
+ "fmla z14.h, z25.h, z2.h[6]\n"
+ "fmla z18.h, z25.h, z1.h[6]\n"
+ "fmla z22.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[6]\n"
+ "fmla z15.h, z24.h, z2.h[6]\n"
+ "fmla z19.h, z24.h, z1.h[6]\n"
+ "fmla z23.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #8\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[7]\n"
+ "fmla z12.h, z25.h, z2.h[7]\n"
+ "fmla z16.h, z25.h, z1.h[7]\n"
+ "fmla z20.h, z25.h, z0.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[7]\n"
+ "fmla z13.h, z24.h, z2.h[7]\n"
+ "fmla z17.h, z24.h, z1.h[7]\n"
+ "fmla z21.h, z24.h, z0.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z3.h[7]\n"
+ "fmla z14.h, z25.h, z2.h[7]\n"
+ "fmla z18.h, z25.h, z1.h[7]\n"
+ "fmla z22.h, z25.h, z0.h[7]\n"
+ "fmla z11.h, z24.h, z3.h[7]\n"
+ "fmla z15.h, z24.h, z2.h[7]\n"
+ "fmla z19.h, z24.h, z1.h[7]\n"
+ "fmla z23.h, z24.h, z0.h[7]\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1572,211 +1572,211 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[0]\n"
+ "fmla z12.h, z25.h, z1.h[0]\n"
+ "fmla z16.h, z25.h, z2.h[0]\n"
+ "fmla z20.h, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[0]\n"
+ "fmla z21.h, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z10.h, z25.h, z0.h[0]\n"
+ "fmla z14.h, z25.h, z1.h[0]\n"
+ "fmla z18.h, z25.h, z2.h[0]\n"
+ "fmla z22.h, z25.h, z3.h[0]\n"
+ "fmla z11.h, z24.h, z0.h[0]\n"
+ "fmla z15.h, z24.h, z1.h[0]\n"
+ "fmla z19.h, z24.h, z2.h[0]\n"
+ "fmla z23.h, z24.h, z3.h[0]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[1]\n"
+ "fmla z12.h, z25.h, z1.h[1]\n"
+ "fmla z16.h, z25.h, z2.h[1]\n"
+ "fmla z20.h, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z9.h, z24.h, z0.h[1]\n"
+ "fmla z13.h, z24.h, z1.h[1]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[1]\n"
+ "fmla z21.h, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z10.h, z25.h, z0.h[1]\n"
+ "fmla z14.h, z25.h, z1.h[1]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z18.h, z25.h, z2.h[1]\n"
+ "fmla z22.h, z25.h, z3.h[1]\n"
+ "fmla z11.h, z24.h, z0.h[1]\n"
+ "fmla z15.h, z24.h, z1.h[1]\n"
+ "fmla z19.h, z24.h, z2.h[1]\n"
+ "fmla z23.h, z24.h, z3.h[1]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[2]\n"
+ "fmla z12.h, z25.h, z1.h[2]\n"
+ "fmla z16.h, z25.h, z2.h[2]\n"
+ "fmla z20.h, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z9.h, z24.h, z0.h[2]\n"
+ "fmla z13.h, z24.h, z1.h[2]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[2]\n"
+ "fmla z21.h, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z10.h, z25.h, z0.h[2]\n"
+ "fmla z14.h, z25.h, z1.h[2]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z18.h, z25.h, z2.h[2]\n"
+ "fmla z22.h, z25.h, z3.h[2]\n"
+ "fmla z11.h, z24.h, z0.h[2]\n"
+ "fmla z15.h, z24.h, z1.h[2]\n"
+ "fmla z19.h, z24.h, z2.h[2]\n"
+ "fmla z23.h, z24.h, z3.h[2]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[3]\n"
+ "fmla z12.h, z25.h, z1.h[3]\n"
+ "fmla z16.h, z25.h, z2.h[3]\n"
+ "fmla z20.h, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z9.h, z24.h, z0.h[3]\n"
+ "fmla z13.h, z24.h, z1.h[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[3]\n"
+ "fmla z21.h, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z10.h, z25.h, z0.h[3]\n"
+ "fmla z14.h, z25.h, z1.h[3]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z18.h, z25.h, z2.h[3]\n"
+ "fmla z22.h, z25.h, z3.h[3]\n"
+ "fmla z11.h, z24.h, z0.h[3]\n"
+ "fmla z15.h, z24.h, z1.h[3]\n"
+ "fmla z19.h, z24.h, z2.h[3]\n"
+ "fmla z23.h, z24.h, z3.h[3]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[4]\n"
+ "fmla z12.h, z25.h, z1.h[4]\n"
+ "fmla z16.h, z25.h, z2.h[4]\n"
+ "fmla z20.h, z25.h, z3.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z9.h, z24.h, z0.h[4]\n"
+ "fmla z13.h, z24.h, z1.h[4]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[4]\n"
+ "fmla z21.h, z24.h, z3.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z10.h, z25.h, z0.h[4]\n"
+ "fmla z14.h, z25.h, z1.h[4]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z18.h, z25.h, z2.h[4]\n"
+ "fmla z22.h, z25.h, z3.h[4]\n"
+ "fmla z11.h, z24.h, z0.h[4]\n"
+ "fmla z15.h, z24.h, z1.h[4]\n"
+ "fmla z19.h, z24.h, z2.h[4]\n"
+ "fmla z23.h, z24.h, z3.h[4]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[5]\n"
+ "fmla z12.h, z25.h, z1.h[5]\n"
+ "fmla z16.h, z25.h, z2.h[5]\n"
+ "fmla z20.h, z25.h, z3.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z9.h, z24.h, z0.h[5]\n"
+ "fmla z13.h, z24.h, z1.h[5]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[5]\n"
+ "fmla z21.h, z24.h, z3.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z10.h, z25.h, z0.h[5]\n"
+ "fmla z14.h, z25.h, z1.h[5]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z18.h, z25.h, z2.h[5]\n"
+ "fmla z22.h, z25.h, z3.h[5]\n"
+ "fmla z11.h, z24.h, z0.h[5]\n"
+ "fmla z15.h, z24.h, z1.h[5]\n"
+ "fmla z19.h, z24.h, z2.h[5]\n"
+ "fmla z23.h, z24.h, z3.h[5]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[6]\n"
+ "fmla z12.h, z25.h, z1.h[6]\n"
+ "fmla z16.h, z25.h, z2.h[6]\n"
+ "fmla z20.h, z25.h, z3.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z9.h, z24.h, z0.h[6]\n"
+ "fmla z13.h, z24.h, z1.h[6]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[6]\n"
+ "fmla z21.h, z24.h, z3.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z10.h, z25.h, z0.h[6]\n"
+ "fmla z14.h, z25.h, z1.h[6]\n"
"addvl x9, x9, #1\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z18.h, z25.h, z2.h[6]\n"
+ "fmla z22.h, z25.h, z3.h[6]\n"
+ "fmla z11.h, z24.h, z0.h[6]\n"
+ "fmla z15.h, z24.h, z1.h[6]\n"
+ "fmla z19.h, z24.h, z2.h[6]\n"
+ "fmla z23.h, z24.h, z3.h[6]\n"
"ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[7]\n"
+ "fmla z12.h, z25.h, z1.h[7]\n"
+ "fmla z16.h, z25.h, z2.h[7]\n"
+ "fmla z20.h, z25.h, z3.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z9.h, z24.h, z0.h[7]\n"
+ "fmla z13.h, z24.h, z1.h[7]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z24.h, z2.h[7]\n"
+ "fmla z21.h, z24.h, z3.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z0.h[7]\n"
+ "fmla z14.h, z25.h, z1.h[7]\n"
+ "fmla z18.h, z25.h, z2.h[7]\n"
+ "fmla z22.h, z25.h, z3.h[7]\n"
+ "fmla z11.h, z24.h, z0.h[7]\n"
+ "fmla z15.h, z24.h, z1.h[7]\n"
+ "fmla z19.h, z24.h, z2.h[7]\n"
+ "fmla z23.h, z24.h, z3.h[7]\n"
"54:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1788,41 +1788,41 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 55f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmin z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z1.h\n"
- "fmin z22.h, p5/M, z22.h, z1.h\n"
- "fmin z23.h, p5/M, z23.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
- "fmax z20.h, p5/M, z20.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z0.h\n"
- "fmax z22.h, p5/M, z22.h, z0.h\n"
- "fmax z23.h, p5/M, z23.h, z0.h\n"
+ "ld1rh { z24.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z25.h\n"
+ "fmin z9.h, p5/M, z9.h, z25.h\n"
+ "fmin z10.h, p5/M, z10.h, z25.h\n"
+ "fmin z11.h, p5/M, z11.h, z25.h\n"
+ "fmin z12.h, p5/M, z12.h, z25.h\n"
+ "fmin z13.h, p5/M, z13.h, z25.h\n"
+ "fmin z14.h, p5/M, z14.h, z25.h\n"
+ "fmin z15.h, p5/M, z15.h, z25.h\n"
+ "fmin z16.h, p5/M, z16.h, z25.h\n"
+ "fmin z17.h, p5/M, z17.h, z25.h\n"
+ "fmin z18.h, p5/M, z18.h, z25.h\n"
+ "fmin z19.h, p5/M, z19.h, z25.h\n"
+ "fmin z20.h, p5/M, z20.h, z25.h\n"
+ "fmin z21.h, p5/M, z21.h, z25.h\n"
+ "fmin z22.h, p5/M, z22.h, z25.h\n"
+ "fmin z23.h, p5/M, z23.h, z25.h\n"
+ "fmax z8.h, p5/M, z8.h, z24.h\n"
+ "fmax z9.h, p5/M, z9.h, z24.h\n"
+ "fmax z10.h, p5/M, z10.h, z24.h\n"
+ "fmax z11.h, p5/M, z11.h, z24.h\n"
+ "fmax z12.h, p5/M, z12.h, z24.h\n"
+ "fmax z13.h, p5/M, z13.h, z24.h\n"
+ "fmax z14.h, p5/M, z14.h, z24.h\n"
+ "fmax z15.h, p5/M, z15.h, z24.h\n"
+ "fmax z16.h, p5/M, z16.h, z24.h\n"
+ "fmax z17.h, p5/M, z17.h, z24.h\n"
+ "fmax z18.h, p5/M, z18.h, z24.h\n"
+ "fmax z19.h, p5/M, z19.h, z24.h\n"
+ "fmax z20.h, p5/M, z20.h, z24.h\n"
+ "fmax z21.h, p5/M, z21.h, z24.h\n"
+ "fmax z22.h, p5/M, z22.h, z24.h\n"
+ "fmax z23.h, p5/M, z23.h, z24.h\n"
"55:" // Height 4: No activation
"st1h { z8.h }, p4, [x13]\n"
"st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -1907,30 +1907,30 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x23, x13, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x22]\n"
- "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x23]\n"
+ "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x22]\n"
+ "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x21]\n"
+ "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x20]\n"
+ "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 62f\n"
"61:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1958,15 +1958,15 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 65f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1977,223 +1977,223 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 65f\n"
"64:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"65:" // Height 5: input setup done
"cmp x27, #0x8\n"
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z29.h, z4.h[0]\n"
+ "fmla z12.h, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z1.h[0]\n"
"add x25, x25, #0x10\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z0.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"add x24, x24, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z21.h, z28.h, z1.h[0]\n"
+ "fmla z25.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z29.h, z4.h[0]\n"
+ "fmla z14.h, z29.h, z3.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z1.h[0]\n"
+ "fmla z26.h, z29.h, z0.h[0]\n"
+ "fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z1.h[0]\n"
+ "fmla z27.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[1]\n"
+ "fmla z12.h, z29.h, z3.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z1.h[1]\n"
+ "fmla z24.h, z29.h, z0.h[1]\n"
+ "fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z1.h[1]\n"
+ "fmla z25.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[1]\n"
+ "fmla z14.h, z29.h, z3.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z1.h[1]\n"
+ "fmla z26.h, z29.h, z0.h[1]\n"
+ "fmla z11.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z1.h[1]\n"
+ "fmla z27.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[2]\n"
+ "fmla z12.h, z29.h, z3.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z1.h[2]\n"
+ "fmla z24.h, z29.h, z0.h[2]\n"
+ "fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z1.h[2]\n"
+ "fmla z25.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[2]\n"
+ "fmla z14.h, z29.h, z3.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z1.h[2]\n"
+ "fmla z26.h, z29.h, z0.h[2]\n"
+ "fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z1.h[2]\n"
+ "fmla z27.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[3]\n"
+ "fmla z12.h, z29.h, z3.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z1.h[3]\n"
+ "fmla z24.h, z29.h, z0.h[3]\n"
+ "fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z1.h[3]\n"
+ "fmla z25.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[3]\n"
+ "fmla z14.h, z29.h, z3.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z1.h[3]\n"
+ "fmla z26.h, z29.h, z0.h[3]\n"
+ "fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z1.h[3]\n"
+ "fmla z27.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[4]\n"
+ "fmla z12.h, z29.h, z3.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z1.h[4]\n"
+ "fmla z24.h, z29.h, z0.h[4]\n"
+ "fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z1.h[4]\n"
+ "fmla z25.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[4]\n"
+ "fmla z14.h, z29.h, z3.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z1.h[4]\n"
+ "fmla z26.h, z29.h, z0.h[4]\n"
+ "fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z1.h[4]\n"
+ "fmla z27.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[5]\n"
+ "fmla z12.h, z29.h, z3.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z1.h[5]\n"
+ "fmla z24.h, z29.h, z0.h[5]\n"
+ "fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z1.h[5]\n"
+ "fmla z25.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[5]\n"
+ "fmla z14.h, z29.h, z3.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z1.h[5]\n"
+ "fmla z26.h, z29.h, z0.h[5]\n"
+ "fmla z11.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z1.h[5]\n"
+ "fmla z27.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[6]\n"
+ "fmla z12.h, z29.h, z3.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z1.h[6]\n"
+ "fmla z24.h, z29.h, z0.h[6]\n"
+ "fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z1.h[6]\n"
+ "fmla z25.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[6]\n"
+ "fmla z14.h, z29.h, z3.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z1.h[6]\n"
+ "fmla z26.h, z29.h, z0.h[6]\n"
+ "fmla z11.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z1.h[6]\n"
+ "fmla z27.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #8\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[7]\n"
+ "fmla z12.h, z29.h, z3.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z1.h[7]\n"
+ "fmla z24.h, z29.h, z0.h[7]\n"
+ "fmla z9.h, z28.h, z4.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z1.h[7]\n"
+ "fmla z25.h, z28.h, z0.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z4.h[7]\n"
+ "fmla z14.h, z29.h, z3.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z1.h[7]\n"
+ "fmla z26.h, z29.h, z0.h[7]\n"
+ "fmla z11.h, z28.h, z4.h[7]\n"
+ "fmla z15.h, z28.h, z3.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z1.h[7]\n"
+ "fmla z27.h, z28.h, z0.h[7]\n"
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -2203,243 +2203,243 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z29.h, z0.h[0]\n"
+ "fmla z12.h, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z3.h[0]\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[0]\n"
+ "fmla z25.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z10.h, z29.h, z0.h[0]\n"
+ "fmla z14.h, z29.h, z1.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z3.h[0]\n"
+ "fmla z26.h, z29.h, z4.h[0]\n"
+ "fmla z11.h, z28.h, z0.h[0]\n"
+ "fmla z15.h, z28.h, z1.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z3.h[0]\n"
+ "fmla z27.h, z28.h, z4.h[0]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[1]\n"
+ "fmla z12.h, z29.h, z1.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z3.h[1]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[1]\n"
+ "fmla z9.h, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z13.h, z28.h, z1.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[1]\n"
+ "fmla z25.h, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z10.h, z29.h, z0.h[1]\n"
+ "fmla z14.h, z29.h, z1.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z3.h[1]\n"
+ "fmla z26.h, z29.h, z4.h[1]\n"
+ "fmla z11.h, z28.h, z0.h[1]\n"
+ "fmla z15.h, z28.h, z1.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z3.h[1]\n"
+ "fmla z27.h, z28.h, z4.h[1]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[2]\n"
+ "fmla z12.h, z29.h, z1.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z3.h[2]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[2]\n"
+ "fmla z9.h, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z13.h, z28.h, z1.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[2]\n"
+ "fmla z25.h, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z10.h, z29.h, z0.h[2]\n"
+ "fmla z14.h, z29.h, z1.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z3.h[2]\n"
+ "fmla z26.h, z29.h, z4.h[2]\n"
+ "fmla z11.h, z28.h, z0.h[2]\n"
+ "fmla z15.h, z28.h, z1.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z3.h[2]\n"
+ "fmla z27.h, z28.h, z4.h[2]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[3]\n"
+ "fmla z12.h, z29.h, z1.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z3.h[3]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[3]\n"
+ "fmla z9.h, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z13.h, z28.h, z1.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[3]\n"
+ "fmla z25.h, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z10.h, z29.h, z0.h[3]\n"
+ "fmla z14.h, z29.h, z1.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z3.h[3]\n"
+ "fmla z26.h, z29.h, z4.h[3]\n"
+ "fmla z11.h, z28.h, z0.h[3]\n"
+ "fmla z15.h, z28.h, z1.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z3.h[3]\n"
+ "fmla z27.h, z28.h, z4.h[3]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[4]\n"
+ "fmla z12.h, z29.h, z1.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z3.h[4]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[4]\n"
+ "fmla z9.h, z28.h, z0.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z13.h, z28.h, z1.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[4]\n"
+ "fmla z25.h, z28.h, z4.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z10.h, z29.h, z0.h[4]\n"
+ "fmla z14.h, z29.h, z1.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z3.h[4]\n"
+ "fmla z26.h, z29.h, z4.h[4]\n"
+ "fmla z11.h, z28.h, z0.h[4]\n"
+ "fmla z15.h, z28.h, z1.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z3.h[4]\n"
+ "fmla z27.h, z28.h, z4.h[4]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[5]\n"
+ "fmla z12.h, z29.h, z1.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z3.h[5]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[5]\n"
+ "fmla z9.h, z28.h, z0.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z13.h, z28.h, z1.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[5]\n"
+ "fmla z25.h, z28.h, z4.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z10.h, z29.h, z0.h[5]\n"
+ "fmla z14.h, z29.h, z1.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z3.h[5]\n"
+ "fmla z26.h, z29.h, z4.h[5]\n"
+ "fmla z11.h, z28.h, z0.h[5]\n"
+ "fmla z15.h, z28.h, z1.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z3.h[5]\n"
+ "fmla z27.h, z28.h, z4.h[5]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[6]\n"
+ "fmla z12.h, z29.h, z1.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z3.h[6]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[6]\n"
+ "fmla z9.h, z28.h, z0.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z13.h, z28.h, z1.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
"addvl x10, x10, #1\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z21.h, z28.h, z3.h[6]\n"
+ "fmla z25.h, z28.h, z4.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z10.h, z29.h, z0.h[6]\n"
+ "fmla z14.h, z29.h, z1.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z3.h[6]\n"
+ "fmla z26.h, z29.h, z4.h[6]\n"
+ "fmla z11.h, z28.h, z0.h[6]\n"
+ "fmla z15.h, z28.h, z1.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z3.h[6]\n"
+ "fmla z27.h, z28.h, z4.h[6]\n"
"ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[7]\n"
+ "fmla z12.h, z29.h, z1.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z3.h[7]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z29.h, z4.h[7]\n"
+ "fmla z9.h, z28.h, z0.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z13.h, z28.h, z1.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z3.h[7]\n"
+ "fmla z25.h, z28.h, z4.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z0.h[7]\n"
+ "fmla z14.h, z29.h, z1.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z3.h[7]\n"
+ "fmla z26.h, z29.h, z4.h[7]\n"
+ "fmla z11.h, z28.h, z0.h[7]\n"
+ "fmla z15.h, z28.h, z1.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z3.h[7]\n"
+ "fmla z27.h, z28.h, z4.h[7]\n"
"68:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2452,49 +2452,49 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 69f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmin z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z1.h\n"
- "fmin z22.h, p5/M, z22.h, z1.h\n"
- "fmin z23.h, p5/M, z23.h, z1.h\n"
- "fmin z24.h, p5/M, z24.h, z1.h\n"
- "fmin z25.h, p5/M, z25.h, z1.h\n"
- "fmin z26.h, p5/M, z26.h, z1.h\n"
- "fmin z27.h, p5/M, z27.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
- "fmax z20.h, p5/M, z20.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z0.h\n"
- "fmax z22.h, p5/M, z22.h, z0.h\n"
- "fmax z23.h, p5/M, z23.h, z0.h\n"
- "fmax z24.h, p5/M, z24.h, z0.h\n"
- "fmax z25.h, p5/M, z25.h, z0.h\n"
- "fmax z26.h, p5/M, z26.h, z0.h\n"
- "fmax z27.h, p5/M, z27.h, z0.h\n"
+ "ld1rh { z28.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z29.h\n"
+ "fmin z9.h, p5/M, z9.h, z29.h\n"
+ "fmin z10.h, p5/M, z10.h, z29.h\n"
+ "fmin z11.h, p5/M, z11.h, z29.h\n"
+ "fmin z12.h, p5/M, z12.h, z29.h\n"
+ "fmin z13.h, p5/M, z13.h, z29.h\n"
+ "fmin z14.h, p5/M, z14.h, z29.h\n"
+ "fmin z15.h, p5/M, z15.h, z29.h\n"
+ "fmin z16.h, p5/M, z16.h, z29.h\n"
+ "fmin z17.h, p5/M, z17.h, z29.h\n"
+ "fmin z18.h, p5/M, z18.h, z29.h\n"
+ "fmin z19.h, p5/M, z19.h, z29.h\n"
+ "fmin z20.h, p5/M, z20.h, z29.h\n"
+ "fmin z21.h, p5/M, z21.h, z29.h\n"
+ "fmin z22.h, p5/M, z22.h, z29.h\n"
+ "fmin z23.h, p5/M, z23.h, z29.h\n"
+ "fmin z24.h, p5/M, z24.h, z29.h\n"
+ "fmin z25.h, p5/M, z25.h, z29.h\n"
+ "fmin z26.h, p5/M, z26.h, z29.h\n"
+ "fmin z27.h, p5/M, z27.h, z29.h\n"
+ "fmax z8.h, p5/M, z8.h, z28.h\n"
+ "fmax z9.h, p5/M, z9.h, z28.h\n"
+ "fmax z10.h, p5/M, z10.h, z28.h\n"
+ "fmax z11.h, p5/M, z11.h, z28.h\n"
+ "fmax z12.h, p5/M, z12.h, z28.h\n"
+ "fmax z13.h, p5/M, z13.h, z28.h\n"
+ "fmax z14.h, p5/M, z14.h, z28.h\n"
+ "fmax z15.h, p5/M, z15.h, z28.h\n"
+ "fmax z16.h, p5/M, z16.h, z28.h\n"
+ "fmax z17.h, p5/M, z17.h, z28.h\n"
+ "fmax z18.h, p5/M, z18.h, z28.h\n"
+ "fmax z19.h, p5/M, z19.h, z28.h\n"
+ "fmax z20.h, p5/M, z20.h, z28.h\n"
+ "fmax z21.h, p5/M, z21.h, z28.h\n"
+ "fmax z22.h, p5/M, z22.h, z28.h\n"
+ "fmax z23.h, p5/M, z23.h, z28.h\n"
+ "fmax z24.h, p5/M, z24.h, z28.h\n"
+ "fmax z25.h, p5/M, z25.h, z28.h\n"
+ "fmax z26.h, p5/M, z26.h, z28.h\n"
+ "fmax z27.h, p5/M, z27.h, z28.h\n"
"69:" // Height 5: No activation
"st1h { z8.h }, p4, [x13]\n"
"st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -2590,35 +2590,35 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x24, x13, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x22]\n"
- "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1h { z28.h }, p4/Z, [x21]\n"
- "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x21]\n"
+ "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x20]\n"
+ "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 76f\n"
"75:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -2650,16 +2650,16 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 79f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2671,258 +2671,258 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"b 79f\n"
"78:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"79:" // Height 6: input setup done
"cmp x27, #0x8\n"
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z1.h, z7.h[0]\n"
+ "fmla z12.h, z1.h, z6.h[0]\n"
+ "fmla z16.h, z1.h, z5.h[0]\n"
+ "fmla z20.h, z1.h, z4.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z1.h, z3.h[0]\n"
+ "fmla z28.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
"add x21, x21, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[0]\n"
+ "fmla z13.h, z0.h, z6.h[0]\n"
+ "fmla z17.h, z0.h, z5.h[0]\n"
+ "fmla z21.h, z0.h, z4.h[0]\n"
+ "fmla z25.h, z0.h, z3.h[0]\n"
+ "fmla z29.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z14.h, z1.h, z6.h[0]\n"
+ "fmla z18.h, z1.h, z5.h[0]\n"
+ "fmla z22.h, z1.h, z4.h[0]\n"
+ "fmla z26.h, z1.h, z3.h[0]\n"
+ "fmla z30.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[0]\n"
+ "fmla z15.h, z0.h, z6.h[0]\n"
+ "fmla z19.h, z0.h, z5.h[0]\n"
+ "fmla z23.h, z0.h, z4.h[0]\n"
+ "fmla z27.h, z0.h, z3.h[0]\n"
+ "fmla z31.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[1]\n"
+ "fmla z12.h, z1.h, z6.h[1]\n"
+ "fmla z16.h, z1.h, z5.h[1]\n"
+ "fmla z20.h, z1.h, z4.h[1]\n"
+ "fmla z24.h, z1.h, z3.h[1]\n"
+ "fmla z28.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[1]\n"
+ "fmla z13.h, z0.h, z6.h[1]\n"
+ "fmla z17.h, z0.h, z5.h[1]\n"
+ "fmla z21.h, z0.h, z4.h[1]\n"
+ "fmla z25.h, z0.h, z3.h[1]\n"
+ "fmla z29.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[1]\n"
+ "fmla z14.h, z1.h, z6.h[1]\n"
+ "fmla z18.h, z1.h, z5.h[1]\n"
+ "fmla z22.h, z1.h, z4.h[1]\n"
+ "fmla z26.h, z1.h, z3.h[1]\n"
+ "fmla z30.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[1]\n"
+ "fmla z15.h, z0.h, z6.h[1]\n"
+ "fmla z19.h, z0.h, z5.h[1]\n"
+ "fmla z23.h, z0.h, z4.h[1]\n"
+ "fmla z27.h, z0.h, z3.h[1]\n"
+ "fmla z31.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[2]\n"
+ "fmla z12.h, z1.h, z6.h[2]\n"
+ "fmla z16.h, z1.h, z5.h[2]\n"
+ "fmla z20.h, z1.h, z4.h[2]\n"
+ "fmla z24.h, z1.h, z3.h[2]\n"
+ "fmla z28.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[2]\n"
+ "fmla z13.h, z0.h, z6.h[2]\n"
+ "fmla z17.h, z0.h, z5.h[2]\n"
+ "fmla z21.h, z0.h, z4.h[2]\n"
+ "fmla z25.h, z0.h, z3.h[2]\n"
+ "fmla z29.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[2]\n"
+ "fmla z14.h, z1.h, z6.h[2]\n"
+ "fmla z18.h, z1.h, z5.h[2]\n"
+ "fmla z22.h, z1.h, z4.h[2]\n"
+ "fmla z26.h, z1.h, z3.h[2]\n"
+ "fmla z30.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[2]\n"
+ "fmla z15.h, z0.h, z6.h[2]\n"
+ "fmla z19.h, z0.h, z5.h[2]\n"
+ "fmla z23.h, z0.h, z4.h[2]\n"
+ "fmla z27.h, z0.h, z3.h[2]\n"
+ "fmla z31.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[3]\n"
+ "fmla z12.h, z1.h, z6.h[3]\n"
+ "fmla z16.h, z1.h, z5.h[3]\n"
+ "fmla z20.h, z1.h, z4.h[3]\n"
+ "fmla z24.h, z1.h, z3.h[3]\n"
+ "fmla z28.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[3]\n"
+ "fmla z13.h, z0.h, z6.h[3]\n"
+ "fmla z17.h, z0.h, z5.h[3]\n"
+ "fmla z21.h, z0.h, z4.h[3]\n"
+ "fmla z25.h, z0.h, z3.h[3]\n"
+ "fmla z29.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[3]\n"
+ "fmla z14.h, z1.h, z6.h[3]\n"
+ "fmla z18.h, z1.h, z5.h[3]\n"
+ "fmla z22.h, z1.h, z4.h[3]\n"
+ "fmla z26.h, z1.h, z3.h[3]\n"
+ "fmla z30.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[3]\n"
+ "fmla z15.h, z0.h, z6.h[3]\n"
+ "fmla z19.h, z0.h, z5.h[3]\n"
+ "fmla z23.h, z0.h, z4.h[3]\n"
+ "fmla z27.h, z0.h, z3.h[3]\n"
+ "fmla z31.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[4]\n"
+ "fmla z12.h, z1.h, z6.h[4]\n"
+ "fmla z16.h, z1.h, z5.h[4]\n"
+ "fmla z20.h, z1.h, z4.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[4]\n"
+ "fmla z28.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[4]\n"
+ "fmla z13.h, z0.h, z6.h[4]\n"
+ "fmla z17.h, z0.h, z5.h[4]\n"
+ "fmla z21.h, z0.h, z4.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[4]\n"
+ "fmla z29.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[4]\n"
+ "fmla z14.h, z1.h, z6.h[4]\n"
+ "fmla z18.h, z1.h, z5.h[4]\n"
+ "fmla z22.h, z1.h, z4.h[4]\n"
+ "fmla z26.h, z1.h, z3.h[4]\n"
+ "fmla z30.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[4]\n"
+ "fmla z15.h, z0.h, z6.h[4]\n"
+ "fmla z19.h, z0.h, z5.h[4]\n"
+ "fmla z23.h, z0.h, z4.h[4]\n"
+ "fmla z27.h, z0.h, z3.h[4]\n"
+ "fmla z31.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[5]\n"
+ "fmla z12.h, z1.h, z6.h[5]\n"
+ "fmla z16.h, z1.h, z5.h[5]\n"
+ "fmla z20.h, z1.h, z4.h[5]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z28.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[5]\n"
+ "fmla z13.h, z0.h, z6.h[5]\n"
+ "fmla z17.h, z0.h, z5.h[5]\n"
+ "fmla z21.h, z0.h, z4.h[5]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z29.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[5]\n"
+ "fmla z14.h, z1.h, z6.h[5]\n"
+ "fmla z18.h, z1.h, z5.h[5]\n"
+ "fmla z22.h, z1.h, z4.h[5]\n"
+ "fmla z26.h, z1.h, z3.h[5]\n"
+ "fmla z30.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[5]\n"
+ "fmla z15.h, z0.h, z6.h[5]\n"
+ "fmla z19.h, z0.h, z5.h[5]\n"
+ "fmla z23.h, z0.h, z4.h[5]\n"
+ "fmla z27.h, z0.h, z3.h[5]\n"
+ "fmla z31.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[6]\n"
+ "fmla z12.h, z1.h, z6.h[6]\n"
+ "fmla z16.h, z1.h, z5.h[6]\n"
+ "fmla z20.h, z1.h, z4.h[6]\n"
+ "fmla z24.h, z1.h, z3.h[6]\n"
+ "fmla z28.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[6]\n"
+ "fmla z13.h, z0.h, z6.h[6]\n"
+ "fmla z17.h, z0.h, z5.h[6]\n"
+ "fmla z21.h, z0.h, z4.h[6]\n"
+ "fmla z25.h, z0.h, z3.h[6]\n"
+ "fmla z29.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[6]\n"
+ "fmla z14.h, z1.h, z6.h[6]\n"
+ "fmla z18.h, z1.h, z5.h[6]\n"
+ "fmla z22.h, z1.h, z4.h[6]\n"
+ "fmla z26.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #7, MUL VL]\n"
"addvl x12, x12, #8\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[6]\n"
+ "fmla z15.h, z0.h, z6.h[6]\n"
+ "fmla z19.h, z0.h, z5.h[6]\n"
+ "fmla z23.h, z0.h, z4.h[6]\n"
+ "fmla z27.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #7, MUL VL]\n"
"addvl x11, x11, #8\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[7]\n"
+ "fmla z12.h, z1.h, z6.h[7]\n"
+ "fmla z16.h, z1.h, z5.h[7]\n"
+ "fmla z20.h, z1.h, z4.h[7]\n"
+ "fmla z24.h, z1.h, z3.h[7]\n"
+ "fmla z28.h, z1.h, z2.h[7]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[7]\n"
+ "fmla z13.h, z0.h, z6.h[7]\n"
+ "fmla z17.h, z0.h, z5.h[7]\n"
+ "fmla z21.h, z0.h, z4.h[7]\n"
+ "fmla z25.h, z0.h, z3.h[7]\n"
+ "fmla z29.h, z0.h, z2.h[7]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[7]\n"
+ "fmla z14.h, z1.h, z6.h[7]\n"
+ "fmla z18.h, z1.h, z5.h[7]\n"
+ "fmla z22.h, z1.h, z4.h[7]\n"
+ "fmla z26.h, z1.h, z3.h[7]\n"
+ "fmla z30.h, z1.h, z2.h[7]\n"
+ "fmla z11.h, z0.h, z7.h[7]\n"
+ "fmla z15.h, z0.h, z6.h[7]\n"
+ "fmla z19.h, z0.h, z5.h[7]\n"
+ "fmla z23.h, z0.h, z4.h[7]\n"
+ "fmla z27.h, z0.h, z3.h[7]\n"
+ "fmla z31.h, z0.h, z2.h[7]\n"
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -2933,275 +2933,275 @@ void sve_ffhybrid_fp16_mla_6x4VL (
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[0]\n"
+ "fmla z12.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z7.h, z3.h[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[0]\n"
+ "fmla z28.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z2.h[0]\n"
+ "fmla z21.h, z6.h, z3.h[0]\n"
+ "fmla z25.h, z6.h, z4.h[0]\n"
+ "fmla z29.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
+ "fmla z10.h, z7.h, z0.h[0]\n"
+ "fmla z14.h, z7.h, z1.h[0]\n"
+ "fmla z18.h, z7.h, z2.h[0]\n"
+ "fmla z22.h, z7.h, z3.h[0]\n"
+ "fmla z26.h, z7.h, z4.h[0]\n"
+ "fmla z30.h, z7.h, z5.h[0]\n"
+ "fmla z11.h, z6.h, z0.h[0]\n"
+ "fmla z15.h, z6.h, z1.h[0]\n"
+ "fmla z19.h, z6.h, z2.h[0]\n"
+ "fmla z23.h, z6.h, z3.h[0]\n"
+ "fmla z27.h, z6.h, z4.h[0]\n"
+ "fmla z31.h, z6.h, z5.h[0]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[1]\n"
+ "fmla z12.h, z7.h, z1.h[1]\n"
+ "fmla z16.h, z7.h, z2.h[1]\n"
+ "fmla z20.h, z7.h, z3.h[1]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[1]\n"
+ "fmla z28.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z9.h, z6.h, z0.h[1]\n"
+ "fmla z13.h, z6.h, z1.h[1]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[1]\n"
+ "fmla z21.h, z6.h, z3.h[1]\n"
+ "fmla z25.h, z6.h, z4.h[1]\n"
+ "fmla z29.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
+ "fmla z10.h, z7.h, z0.h[1]\n"
+ "fmla z14.h, z7.h, z1.h[1]\n"
+ "fmla z18.h, z7.h, z2.h[1]\n"
+ "fmla z22.h, z7.h, z3.h[1]\n"
+ "fmla z26.h, z7.h, z4.h[1]\n"
+ "fmla z30.h, z7.h, z5.h[1]\n"
+ "fmla z11.h, z6.h, z0.h[1]\n"
+ "fmla z15.h, z6.h, z1.h[1]\n"
+ "fmla z19.h, z6.h, z2.h[1]\n"
+ "fmla z23.h, z6.h, z3.h[1]\n"
+ "fmla z27.h, z6.h, z4.h[1]\n"
+ "fmla z31.h, z6.h, z5.h[1]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[2]\n"
+ "fmla z12.h, z7.h, z1.h[2]\n"
+ "fmla z16.h, z7.h, z2.h[2]\n"
+ "fmla z20.h, z7.h, z3.h[2]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[2]\n"
+ "fmla z28.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z9.h, z6.h, z0.h[2]\n"
+ "fmla z13.h, z6.h, z1.h[2]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[2]\n"
+ "fmla z21.h, z6.h, z3.h[2]\n"
+ "fmla z25.h, z6.h, z4.h[2]\n"
+ "fmla z29.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
+ "fmla z10.h, z7.h, z0.h[2]\n"
+ "fmla z14.h, z7.h, z1.h[2]\n"
+ "fmla z18.h, z7.h, z2.h[2]\n"
+ "fmla z22.h, z7.h, z3.h[2]\n"
+ "fmla z26.h, z7.h, z4.h[2]\n"
+ "fmla z30.h, z7.h, z5.h[2]\n"
+ "fmla z11.h, z6.h, z0.h[2]\n"
+ "fmla z15.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z2.h[2]\n"
+ "fmla z23.h, z6.h, z3.h[2]\n"
+ "fmla z27.h, z6.h, z4.h[2]\n"
+ "fmla z31.h, z6.h, z5.h[2]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[3]\n"
+ "fmla z12.h, z7.h, z1.h[3]\n"
+ "fmla z16.h, z7.h, z2.h[3]\n"
+ "fmla z20.h, z7.h, z3.h[3]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[3]\n"
+ "fmla z28.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z9.h, z6.h, z0.h[3]\n"
+ "fmla z13.h, z6.h, z1.h[3]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[3]\n"
+ "fmla z21.h, z6.h, z3.h[3]\n"
+ "fmla z25.h, z6.h, z4.h[3]\n"
+ "fmla z29.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
+ "fmla z10.h, z7.h, z0.h[3]\n"
+ "fmla z14.h, z7.h, z1.h[3]\n"
+ "fmla z18.h, z7.h, z2.h[3]\n"
+ "fmla z22.h, z7.h, z3.h[3]\n"
+ "fmla z26.h, z7.h, z4.h[3]\n"
+ "fmla z30.h, z7.h, z5.h[3]\n"
+ "fmla z11.h, z6.h, z0.h[3]\n"
+ "fmla z15.h, z6.h, z1.h[3]\n"
+ "fmla z19.h, z6.h, z2.h[3]\n"
+ "fmla z23.h, z6.h, z3.h[3]\n"
+ "fmla z27.h, z6.h, z4.h[3]\n"
+ "fmla z31.h, z6.h, z5.h[3]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[4]\n"
+ "fmla z12.h, z7.h, z1.h[4]\n"
+ "fmla z16.h, z7.h, z2.h[4]\n"
+ "fmla z20.h, z7.h, z3.h[4]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[4]\n"
+ "fmla z28.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z9.h, z6.h, z0.h[4]\n"
+ "fmla z13.h, z6.h, z1.h[4]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[4]\n"
+ "fmla z21.h, z6.h, z3.h[4]\n"
+ "fmla z25.h, z6.h, z4.h[4]\n"
+ "fmla z29.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
+ "fmla z10.h, z7.h, z0.h[4]\n"
+ "fmla z14.h, z7.h, z1.h[4]\n"
+ "fmla z18.h, z7.h, z2.h[4]\n"
+ "fmla z22.h, z7.h, z3.h[4]\n"
+ "fmla z26.h, z7.h, z4.h[4]\n"
+ "fmla z30.h, z7.h, z5.h[4]\n"
+ "fmla z11.h, z6.h, z0.h[4]\n"
+ "fmla z15.h, z6.h, z1.h[4]\n"
+ "fmla z19.h, z6.h, z2.h[4]\n"
+ "fmla z23.h, z6.h, z3.h[4]\n"
+ "fmla z27.h, z6.h, z4.h[4]\n"
+ "fmla z31.h, z6.h, z5.h[4]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[5]\n"
+ "fmla z12.h, z7.h, z1.h[5]\n"
+ "fmla z16.h, z7.h, z2.h[5]\n"
+ "fmla z20.h, z7.h, z3.h[5]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[5]\n"
+ "fmla z28.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z9.h, z6.h, z0.h[5]\n"
+ "fmla z13.h, z6.h, z1.h[5]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[5]\n"
+ "fmla z21.h, z6.h, z3.h[5]\n"
+ "fmla z25.h, z6.h, z4.h[5]\n"
+ "fmla z29.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
+ "fmla z10.h, z7.h, z0.h[5]\n"
+ "fmla z14.h, z7.h, z1.h[5]\n"
+ "fmla z18.h, z7.h, z2.h[5]\n"
+ "fmla z22.h, z7.h, z3.h[5]\n"
+ "fmla z26.h, z7.h, z4.h[5]\n"
+ "fmla z30.h, z7.h, z5.h[5]\n"
+ "fmla z11.h, z6.h, z0.h[5]\n"
+ "fmla z15.h, z6.h, z1.h[5]\n"
+ "fmla z19.h, z6.h, z2.h[5]\n"
+ "fmla z23.h, z6.h, z3.h[5]\n"
+ "fmla z27.h, z6.h, z4.h[5]\n"
+ "fmla z31.h, z6.h, z5.h[5]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[6]\n"
+ "fmla z12.h, z7.h, z1.h[6]\n"
+ "fmla z16.h, z7.h, z2.h[6]\n"
+ "fmla z20.h, z7.h, z3.h[6]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[6]\n"
+ "fmla z28.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z9.h, z6.h, z0.h[6]\n"
+ "fmla z13.h, z6.h, z1.h[6]\n"
"addvl x10, x10, #1\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z17.h, z6.h, z2.h[6]\n"
+ "fmla z21.h, z6.h, z3.h[6]\n"
+ "fmla z25.h, z6.h, z4.h[6]\n"
+ "fmla z29.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
+ "fmla z10.h, z7.h, z0.h[6]\n"
+ "fmla z14.h, z7.h, z1.h[6]\n"
+ "fmla z18.h, z7.h, z2.h[6]\n"
+ "fmla z22.h, z7.h, z3.h[6]\n"
+ "fmla z26.h, z7.h, z4.h[6]\n"
+ "fmla z30.h, z7.h, z5.h[6]\n"
+ "fmla z11.h, z6.h, z0.h[6]\n"
+ "fmla z15.h, z6.h, z1.h[6]\n"
+ "fmla z19.h, z6.h, z2.h[6]\n"
+ "fmla z23.h, z6.h, z3.h[6]\n"
+ "fmla z27.h, z6.h, z4.h[6]\n"
+ "fmla z31.h, z6.h, z5.h[6]\n"
"ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x12]\n"
- "ld1h { z7.h }, p5/Z, [x11]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[7]\n"
+ "fmla z12.h, z7.h, z1.h[7]\n"
+ "fmla z16.h, z7.h, z2.h[7]\n"
+ "fmla z20.h, z7.h, z3.h[7]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
+ "fmla z24.h, z7.h, z4.h[7]\n"
+ "fmla z28.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x9]\n"
+ "fmla z9.h, z6.h, z0.h[7]\n"
+ "fmla z13.h, z6.h, z1.h[7]\n"
+ "fmla z17.h, z6.h, z2.h[7]\n"
+ "fmla z21.h, z6.h, z3.h[7]\n"
+ "fmla z25.h, z6.h, z4.h[7]\n"
+ "fmla z29.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z7.h, z0.h[7]\n"
+ "fmla z14.h, z7.h, z1.h[7]\n"
+ "fmla z18.h, z7.h, z2.h[7]\n"
+ "fmla z22.h, z7.h, z3.h[7]\n"
+ "fmla z26.h, z7.h, z4.h[7]\n"
+ "fmla z30.h, z7.h, z5.h[7]\n"
+ "fmla z11.h, z6.h, z0.h[7]\n"
+ "fmla z15.h, z6.h, z1.h[7]\n"
+ "fmla z19.h, z6.h, z2.h[7]\n"
+ "fmla z23.h, z6.h, z3.h[7]\n"
+ "fmla z27.h, z6.h, z4.h[7]\n"
+ "fmla z31.h, z6.h, z5.h[7]\n"
"82:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3315,4 +3315,4 @@ void sve_ffhybrid_fp16_mla_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
index b4c124c1e3..3a93a2f7c8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
index 32fcac3a45..8e4fd4388e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -163,11 +163,11 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -183,12 +183,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"10:" // Height 1: Multiply loop: Main loop
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -201,12 +201,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
@@ -214,17 +214,17 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"bne 7b\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p3, [x13]\n"
"st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -285,15 +285,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"17:" // Height 2: no bias
"tbz %x[flags], #0, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
+ "add x20, x13, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 19f\n"
"18:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -309,12 +309,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"20:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 21f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 22f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -322,7 +322,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 22f\n"
"21:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"22:" // Height 2: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -333,19 +333,19 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"23:" // Height 2: Multiply loop: Main loop
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
"addvl x12, x12, #1\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"addvl x11, x11, #1\n"
"add x26, x26, #0x4\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
"add x25, x25, #0x4\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
"addvl x10, x10, #1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
@@ -357,18 +357,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"bne 20b\n"
@@ -376,25 +376,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmin z12.s, p4/M, z12.s, z17.s\n"
+ "fmin z13.s, p4/M, z13.s, z17.s\n"
+ "fmin z14.s, p4/M, z14.s, z17.s\n"
+ "fmin z15.s, p4/M, z15.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "fmax z12.s, p4/M, z12.s, z16.s\n"
+ "fmax z13.s, p4/M, z13.s, z16.s\n"
+ "fmax z14.s, p4/M, z14.s, z16.s\n"
+ "fmax z15.s, p4/M, z15.s, z16.s\n"
"25:" // Height 2: No activation
"st1w { z8.s }, p3, [x13]\n"
"st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -463,20 +463,20 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"30:" // Height 3: no bias
"tbz %x[flags], #0, 31f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 32f\n"
"31:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -496,13 +496,13 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"33:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 34f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 35f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -511,8 +511,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 35f\n"
"34:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"35:" // Height 3: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -528,22 +528,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
"add x26, x26, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z20.s }, p4/Z, [x9]\n"
"subs x27, x27, #0x1\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
@@ -557,54 +557,54 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
"cmp x28, x20\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z20.s }, p4/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
"addvl x9, x9, #1\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
"bne 33b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z21.s\n"
+ "fmin z9.s, p4/M, z9.s, z21.s\n"
+ "fmin z10.s, p4/M, z10.s, z21.s\n"
+ "fmin z11.s, p4/M, z11.s, z21.s\n"
+ "fmin z12.s, p4/M, z12.s, z21.s\n"
+ "fmin z13.s, p4/M, z13.s, z21.s\n"
+ "fmin z14.s, p4/M, z14.s, z21.s\n"
+ "fmin z15.s, p4/M, z15.s, z21.s\n"
+ "fmin z16.s, p4/M, z16.s, z21.s\n"
+ "fmin z17.s, p4/M, z17.s, z21.s\n"
+ "fmin z18.s, p4/M, z18.s, z21.s\n"
+ "fmin z19.s, p4/M, z19.s, z21.s\n"
+ "fmax z8.s, p4/M, z8.s, z20.s\n"
+ "fmax z9.s, p4/M, z9.s, z20.s\n"
+ "fmax z10.s, p4/M, z10.s, z20.s\n"
+ "fmax z11.s, p4/M, z11.s, z20.s\n"
+ "fmax z12.s, p4/M, z12.s, z20.s\n"
+ "fmax z13.s, p4/M, z13.s, z20.s\n"
+ "fmax z14.s, p4/M, z14.s, z20.s\n"
+ "fmax z15.s, p4/M, z15.s, z20.s\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
+ "fmax z17.s, p4/M, z17.s, z20.s\n"
+ "fmax z18.s, p4/M, z18.s, z20.s\n"
+ "fmax z19.s, p4/M, z19.s, z20.s\n"
"38:" // Height 3: No activation
"st1w { z8.s }, p3, [x13]\n"
"st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -681,25 +681,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"43:" // Height 4: no bias
"tbz %x[flags], #0, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 45f\n"
"44:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -723,14 +723,14 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -740,9 +740,9 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"48:" // Height 4: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -759,7 +759,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z25.s }, p4/Z, [x10]\n"
"add x26, x26, #0x4\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -767,22 +767,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x25, x25, #0x4\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z24.s }, p4/Z, [x9]\n"
"add x24, x24, #0x4\n"
"add x23, x23, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
"addvl x10, x10, #1\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
"addvl x9, x9, #1\n"
"ld1w { z6.s }, p4/Z, [x12]\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
@@ -794,7 +794,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z25.s }, p4/Z, [x10]\n"
"cmp x28, x20\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -802,17 +802,17 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"addvl x11, x11, #1\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z24.s }, p4/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
"addvl x9, x9, #1\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #2\n"
@@ -820,41 +820,41 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmin z20.s, p4/M, z20.s, z1.s\n"
- "fmin z21.s, p4/M, z21.s, z1.s\n"
- "fmin z22.s, p4/M, z22.s, z1.s\n"
- "fmin z23.s, p4/M, z23.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
- "fmax z20.s, p4/M, z20.s, z0.s\n"
- "fmax z21.s, p4/M, z21.s, z0.s\n"
- "fmax z22.s, p4/M, z22.s, z0.s\n"
- "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "ld1rw { z24.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z25.s\n"
+ "fmin z9.s, p4/M, z9.s, z25.s\n"
+ "fmin z10.s, p4/M, z10.s, z25.s\n"
+ "fmin z11.s, p4/M, z11.s, z25.s\n"
+ "fmin z12.s, p4/M, z12.s, z25.s\n"
+ "fmin z13.s, p4/M, z13.s, z25.s\n"
+ "fmin z14.s, p4/M, z14.s, z25.s\n"
+ "fmin z15.s, p4/M, z15.s, z25.s\n"
+ "fmin z16.s, p4/M, z16.s, z25.s\n"
+ "fmin z17.s, p4/M, z17.s, z25.s\n"
+ "fmin z18.s, p4/M, z18.s, z25.s\n"
+ "fmin z19.s, p4/M, z19.s, z25.s\n"
+ "fmin z20.s, p4/M, z20.s, z25.s\n"
+ "fmin z21.s, p4/M, z21.s, z25.s\n"
+ "fmin z22.s, p4/M, z22.s, z25.s\n"
+ "fmin z23.s, p4/M, z23.s, z25.s\n"
+ "fmax z8.s, p4/M, z8.s, z24.s\n"
+ "fmax z9.s, p4/M, z9.s, z24.s\n"
+ "fmax z10.s, p4/M, z10.s, z24.s\n"
+ "fmax z11.s, p4/M, z11.s, z24.s\n"
+ "fmax z12.s, p4/M, z12.s, z24.s\n"
+ "fmax z13.s, p4/M, z13.s, z24.s\n"
+ "fmax z14.s, p4/M, z14.s, z24.s\n"
+ "fmax z15.s, p4/M, z15.s, z24.s\n"
+ "fmax z16.s, p4/M, z16.s, z24.s\n"
+ "fmax z17.s, p4/M, z17.s, z24.s\n"
+ "fmax z18.s, p4/M, z18.s, z24.s\n"
+ "fmax z19.s, p4/M, z19.s, z24.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "fmax z21.s, p4/M, z21.s, z24.s\n"
+ "fmax z22.s, p4/M, z22.s, z24.s\n"
+ "fmax z23.s, p4/M, z23.s, z24.s\n"
"51:" // Height 4: No activation
"st1w { z8.s }, p3, [x13]\n"
"st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -939,30 +939,30 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"56:" // Height 5: no bias
"tbz %x[flags], #0, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x13, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 58f\n"
"57:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -990,15 +990,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"59:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 60f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1009,10 +1009,10 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 61f\n"
"60:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"61:" // Height 5: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1034,7 +1034,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
"add x25, x25, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -1042,24 +1042,24 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x23, x23, #0x4\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z28.s }, p4/Z, [x9]\n"
"add x22, x22, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z26.s, p4/M, z6.s, z4.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x12]\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
- "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1w { z7.s }, p4/Z, [x11]\n"
@@ -1075,25 +1075,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"addvl x12, x12, #1\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
"addvl x11, x11, #1\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"addvl x10, x10, #1\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
- "ld1w { z7.s }, p4/Z, [x9]\n"
+ "ld1w { z28.s }, p4/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z26.s, p4/M, z6.s, z4.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
- "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
"bne 59b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x13, x20, LSL #2\n"
@@ -1102,49 +1102,49 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmin z20.s, p4/M, z20.s, z1.s\n"
- "fmin z21.s, p4/M, z21.s, z1.s\n"
- "fmin z22.s, p4/M, z22.s, z1.s\n"
- "fmin z23.s, p4/M, z23.s, z1.s\n"
- "fmin z24.s, p4/M, z24.s, z1.s\n"
- "fmin z25.s, p4/M, z25.s, z1.s\n"
- "fmin z26.s, p4/M, z26.s, z1.s\n"
- "fmin z27.s, p4/M, z27.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
- "fmax z20.s, p4/M, z20.s, z0.s\n"
- "fmax z21.s, p4/M, z21.s, z0.s\n"
- "fmax z22.s, p4/M, z22.s, z0.s\n"
- "fmax z23.s, p4/M, z23.s, z0.s\n"
- "fmax z24.s, p4/M, z24.s, z0.s\n"
- "fmax z25.s, p4/M, z25.s, z0.s\n"
- "fmax z26.s, p4/M, z26.s, z0.s\n"
- "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "ld1rw { z28.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z29.s\n"
+ "fmin z9.s, p4/M, z9.s, z29.s\n"
+ "fmin z10.s, p4/M, z10.s, z29.s\n"
+ "fmin z11.s, p4/M, z11.s, z29.s\n"
+ "fmin z12.s, p4/M, z12.s, z29.s\n"
+ "fmin z13.s, p4/M, z13.s, z29.s\n"
+ "fmin z14.s, p4/M, z14.s, z29.s\n"
+ "fmin z15.s, p4/M, z15.s, z29.s\n"
+ "fmin z16.s, p4/M, z16.s, z29.s\n"
+ "fmin z17.s, p4/M, z17.s, z29.s\n"
+ "fmin z18.s, p4/M, z18.s, z29.s\n"
+ "fmin z19.s, p4/M, z19.s, z29.s\n"
+ "fmin z20.s, p4/M, z20.s, z29.s\n"
+ "fmin z21.s, p4/M, z21.s, z29.s\n"
+ "fmin z22.s, p4/M, z22.s, z29.s\n"
+ "fmin z23.s, p4/M, z23.s, z29.s\n"
+ "fmin z24.s, p4/M, z24.s, z29.s\n"
+ "fmin z25.s, p4/M, z25.s, z29.s\n"
+ "fmin z26.s, p4/M, z26.s, z29.s\n"
+ "fmin z27.s, p4/M, z27.s, z29.s\n"
+ "fmax z8.s, p4/M, z8.s, z28.s\n"
+ "fmax z9.s, p4/M, z9.s, z28.s\n"
+ "fmax z10.s, p4/M, z10.s, z28.s\n"
+ "fmax z11.s, p4/M, z11.s, z28.s\n"
+ "fmax z12.s, p4/M, z12.s, z28.s\n"
+ "fmax z13.s, p4/M, z13.s, z28.s\n"
+ "fmax z14.s, p4/M, z14.s, z28.s\n"
+ "fmax z15.s, p4/M, z15.s, z28.s\n"
+ "fmax z16.s, p4/M, z16.s, z28.s\n"
+ "fmax z17.s, p4/M, z17.s, z28.s\n"
+ "fmax z18.s, p4/M, z18.s, z28.s\n"
+ "fmax z19.s, p4/M, z19.s, z28.s\n"
+ "fmax z20.s, p4/M, z20.s, z28.s\n"
+ "fmax z21.s, p4/M, z21.s, z28.s\n"
+ "fmax z22.s, p4/M, z22.s, z28.s\n"
+ "fmax z23.s, p4/M, z23.s, z28.s\n"
+ "fmax z24.s, p4/M, z24.s, z28.s\n"
+ "fmax z25.s, p4/M, z25.s, z28.s\n"
+ "fmax z26.s, p4/M, z26.s, z28.s\n"
+ "fmax z27.s, p4/M, z27.s, z28.s\n"
"64:" // Height 5: No activation
"st1w { z8.s }, p3, [x13]\n"
"st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -1240,35 +1240,35 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"69:" // Height 6: no bias
"tbz %x[flags], #0, 70f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x24, x13, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p3/Z, [x21]\n"
- "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 71f\n"
"70:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1300,16 +1300,16 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"72:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 74f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1321,11 +1321,11 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
"b 74f\n"
"73:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"74:" // Height 6: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1527,4 +1527,4 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
index eb057e7734..b1ab31e618 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
@@ -163,11 +163,11 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -180,40 +180,40 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
"sub x27, x27, #0x4\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
"cmp x27, #0x4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
"add x26, x26, #0x10\n"
"addvl x12, x12, #4\n"
"addvl x11, x11, #4\n"
@@ -223,56 +223,56 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
@@ -284,17 +284,17 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"bne 7b\n"
"tbz %x[flags], #1, 13f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"13:" // Height 1: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -355,15 +355,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
+ "add x20, x13, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 20f\n"
"19:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -379,12 +379,12 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -392,143 +392,143 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 23f\n"
"22:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"23:" // Height 2: input setup done
"cmp x27, #0x4\n"
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z1.s[0]\n"
+ "fmla z12.s, z17.s, z0.s[0]\n"
+ "fmla z9.s, z16.s, z1.s[0]\n"
+ "fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z1.s[0]\n"
+ "fmla z14.s, z17.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n"
"cmp x27, #0x4\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[0]\n"
+ "fmla z15.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
"add x26, x26, #0x10\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[1]\n"
+ "fmla z12.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[1]\n"
+ "fmla z13.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[1]\n"
+ "fmla z14.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[1]\n"
+ "fmla z15.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[2]\n"
+ "fmla z12.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[2]\n"
+ "fmla z13.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[2]\n"
+ "fmla z14.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[2]\n"
+ "fmla z15.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[3]\n"
+ "fmla z12.s, z17.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[3]\n"
+ "fmla z13.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z10.s, z17.s, z1.s[3]\n"
+ "fmla z14.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z1.s[3]\n"
+ "fmla z15.s, z16.s, z0.s[3]\n"
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[0]\n"
+ "fmla z12.s, z17.s, z1.s[0]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z14.s, z17.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "fmla z15.s, z16.s, z1.s[0]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z12.s, z17.s, z1.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z14.s, z17.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "fmla z15.s, z16.s, z1.s[1]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z12.s, z17.s, z1.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z14.s, z17.s, z1.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "fmla z15.s, z16.s, z1.s[2]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z12.s, z17.s, z1.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z14.s, z17.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "fmla z15.s, z16.s, z1.s[3]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
"26:" // Height 2: Multiply loop: multiply skip
@@ -540,25 +540,25 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"add x25, x13, x20, LSL #2\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
"27:" // Height 2: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -627,20 +627,20 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 34f\n"
"33:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -660,13 +660,13 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -675,89 +675,89 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 37f\n"
"36:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"37:" // Height 3: input setup done
"cmp x27, #0x4\n"
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z21.s, z2.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
"cmp x27, #0x4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z10.s, z21.s, z2.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z18.s, z21.s, z0.s[0]\n"
+ "fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z0.s[1]\n"
+ "fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z0.s[1]\n"
+ "fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z0.s[2]\n"
+ "fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z0.s[2]\n"
+ "fmla z11.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z0.s[3]\n"
+ "fmla z9.s, z20.s, z2.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z10.s, z21.s, z2.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z0.s[3]\n"
+ "fmla z11.s, z20.s, z2.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z0.s[3]\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -765,91 +765,91 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z21.s, z0.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z2.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z10.s, z21.s, z0.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z18.s, z21.s, z2.s[0]\n"
+ "fmla z11.s, z20.s, z0.s[0]\n"
"addvl x9, x9, #1\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z2.s[0]\n"
"ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z2.s[1]\n"
+ "fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z2.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z10.s, z21.s, z0.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z18.s, z21.s, z2.s[1]\n"
+ "fmla z11.s, z20.s, z0.s[1]\n"
"addvl x9, x9, #1\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z2.s[1]\n"
"ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z2.s[2]\n"
+ "fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z2.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
"addvl x12, x12, #1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z10.s, z21.s, z0.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z18.s, z21.s, z2.s[2]\n"
+ "fmla z11.s, z20.s, z0.s[2]\n"
"addvl x9, x9, #1\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z2.s[2]\n"
"ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z2.s[3]\n"
+ "fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z2.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
"addvl x11, x11, #1\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z10.s, z21.s, z0.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
"addvl x10, x10, #1\n"
"addvl x9, x9, #1\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z18.s, z21.s, z2.s[3]\n"
+ "fmla z11.s, z20.s, z0.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z2.s[3]\n"
"40:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -860,33 +860,33 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 41f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
"41:" // Height 3: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -963,25 +963,25 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 48f\n"
"47:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1005,14 +1005,14 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1022,108 +1022,108 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 51f\n"
"50:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"51:" // Height 4: input setup done
"cmp x27, #0x4\n"
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z3.s[0]\n"
+ "fmla z12.s, z25.s, z2.s[0]\n"
+ "fmla z16.s, z25.s, z1.s[0]\n"
+ "fmla z20.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z17.s, z24.s, z1.s[0]\n"
+ "fmla z21.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z25.s, z3.s[0]\n"
+ "fmla z14.s, z25.s, z2.s[0]\n"
+ "fmla z18.s, z25.s, z1.s[0]\n"
+ "fmla z22.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[0]\n"
+ "fmla z15.s, z24.s, z2.s[0]\n"
+ "fmla z19.s, z24.s, z1.s[0]\n"
+ "fmla z23.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[1]\n"
+ "fmla z12.s, z25.s, z2.s[1]\n"
+ "fmla z16.s, z25.s, z1.s[1]\n"
+ "fmla z20.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[1]\n"
+ "fmla z13.s, z24.s, z2.s[1]\n"
+ "fmla z17.s, z24.s, z1.s[1]\n"
+ "fmla z21.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[1]\n"
+ "fmla z14.s, z25.s, z2.s[1]\n"
+ "fmla z18.s, z25.s, z1.s[1]\n"
+ "fmla z22.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[1]\n"
+ "fmla z15.s, z24.s, z2.s[1]\n"
+ "fmla z19.s, z24.s, z1.s[1]\n"
+ "fmla z23.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[2]\n"
+ "fmla z12.s, z25.s, z2.s[2]\n"
+ "fmla z16.s, z25.s, z1.s[2]\n"
+ "fmla z20.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[2]\n"
+ "fmla z13.s, z24.s, z2.s[2]\n"
+ "fmla z17.s, z24.s, z1.s[2]\n"
+ "fmla z21.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[2]\n"
+ "fmla z14.s, z25.s, z2.s[2]\n"
+ "fmla z18.s, z25.s, z1.s[2]\n"
+ "fmla z22.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[2]\n"
+ "fmla z15.s, z24.s, z2.s[2]\n"
+ "fmla z19.s, z24.s, z1.s[2]\n"
+ "fmla z23.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[3]\n"
+ "fmla z12.s, z25.s, z2.s[3]\n"
+ "fmla z16.s, z25.s, z1.s[3]\n"
+ "fmla z20.s, z25.s, z0.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[3]\n"
+ "fmla z13.s, z24.s, z2.s[3]\n"
+ "fmla z17.s, z24.s, z1.s[3]\n"
+ "fmla z21.s, z24.s, z0.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z3.s[3]\n"
+ "fmla z14.s, z25.s, z2.s[3]\n"
+ "fmla z18.s, z25.s, z1.s[3]\n"
+ "fmla z22.s, z25.s, z0.s[3]\n"
+ "fmla z11.s, z24.s, z3.s[3]\n"
+ "fmla z15.s, z24.s, z2.s[3]\n"
+ "fmla z19.s, z24.s, z1.s[3]\n"
+ "fmla z23.s, z24.s, z0.s[3]\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -1132,107 +1132,107 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[0]\n"
+ "fmla z12.s, z25.s, z1.s[0]\n"
+ "fmla z16.s, z25.s, z2.s[0]\n"
+ "fmla z20.s, z25.s, z3.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z24.s, z2.s[0]\n"
+ "fmla z21.s, z24.s, z3.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z10.s, z25.s, z0.s[0]\n"
+ "fmla z14.s, z25.s, z1.s[0]\n"
+ "fmla z18.s, z25.s, z2.s[0]\n"
+ "fmla z22.s, z25.s, z3.s[0]\n"
+ "fmla z11.s, z24.s, z0.s[0]\n"
+ "fmla z15.s, z24.s, z1.s[0]\n"
+ "fmla z19.s, z24.s, z2.s[0]\n"
+ "fmla z23.s, z24.s, z3.s[0]\n"
"ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[1]\n"
+ "fmla z12.s, z25.s, z1.s[1]\n"
+ "fmla z16.s, z25.s, z2.s[1]\n"
+ "fmla z20.s, z25.s, z3.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z9.s, z24.s, z0.s[1]\n"
+ "fmla z13.s, z24.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z24.s, z2.s[1]\n"
+ "fmla z21.s, z24.s, z3.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z10.s, z25.s, z0.s[1]\n"
+ "fmla z14.s, z25.s, z1.s[1]\n"
"addvl x9, x9, #1\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z18.s, z25.s, z2.s[1]\n"
+ "fmla z22.s, z25.s, z3.s[1]\n"
+ "fmla z11.s, z24.s, z0.s[1]\n"
+ "fmla z15.s, z24.s, z1.s[1]\n"
+ "fmla z19.s, z24.s, z2.s[1]\n"
+ "fmla z23.s, z24.s, z3.s[1]\n"
"ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[2]\n"
+ "fmla z12.s, z25.s, z1.s[2]\n"
+ "fmla z16.s, z25.s, z2.s[2]\n"
+ "fmla z20.s, z25.s, z3.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z9.s, z24.s, z0.s[2]\n"
+ "fmla z13.s, z24.s, z1.s[2]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z24.s, z2.s[2]\n"
+ "fmla z21.s, z24.s, z3.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
"addvl x10, x10, #1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z10.s, z25.s, z0.s[2]\n"
+ "fmla z14.s, z25.s, z1.s[2]\n"
"addvl x9, x9, #1\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z18.s, z25.s, z2.s[2]\n"
+ "fmla z22.s, z25.s, z3.s[2]\n"
+ "fmla z11.s, z24.s, z0.s[2]\n"
+ "fmla z15.s, z24.s, z1.s[2]\n"
+ "fmla z19.s, z24.s, z2.s[2]\n"
+ "fmla z23.s, z24.s, z3.s[2]\n"
"ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[3]\n"
+ "fmla z12.s, z25.s, z1.s[3]\n"
+ "fmla z16.s, z25.s, z2.s[3]\n"
+ "fmla z20.s, z25.s, z3.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
"addvl x12, x12, #1\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z9.s, z24.s, z0.s[3]\n"
+ "fmla z13.s, z24.s, z1.s[3]\n"
"addvl x11, x11, #1\n"
"addvl x10, x10, #1\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z24.s, z2.s[3]\n"
+ "fmla z21.s, z24.s, z3.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z0.s[3]\n"
+ "fmla z14.s, z25.s, z1.s[3]\n"
+ "fmla z18.s, z25.s, z2.s[3]\n"
+ "fmla z22.s, z25.s, z3.s[3]\n"
+ "fmla z11.s, z24.s, z0.s[3]\n"
+ "fmla z15.s, z24.s, z1.s[3]\n"
+ "fmla z19.s, z24.s, z2.s[3]\n"
+ "fmla z23.s, z24.s, z3.s[3]\n"
"54:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1244,41 +1244,41 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 55f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
"55:" // Height 4: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -1363,30 +1363,30 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"60:" // Height 5: no bias
"tbz %x[flags], #0, 61f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x13, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 62f\n"
"61:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1414,15 +1414,15 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"63:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 64f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 65f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1433,127 +1433,127 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 65f\n"
"64:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"65:" // Height 5: input setup done
"cmp x27, #0x4\n"
"ble 67f\n"
"66:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z29.s, z4.s[0]\n"
+ "fmla z12.s, z29.s, z3.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
"add x25, x25, #0x10\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z29.s, z0.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"add x24, x24, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z21.s, z28.s, z1.s[0]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z29.s, z4.s[0]\n"
+ "fmla z14.s, z29.s, z3.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[0]\n"
+ "fmla z27.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[1]\n"
+ "fmla z12.s, z29.s, z3.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "fmla z24.s, z29.s, z0.s[1]\n"
+ "fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z25.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[1]\n"
+ "fmla z14.s, z29.s, z3.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z26.s, z29.s, z0.s[1]\n"
+ "fmla z11.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z27.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[2]\n"
+ "fmla z12.s, z29.s, z3.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[2]\n"
+ "fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z1.s[2]\n"
+ "fmla z25.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[2]\n"
+ "fmla z14.s, z29.s, z3.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z1.s[2]\n"
+ "fmla z26.s, z29.s, z0.s[2]\n"
+ "fmla z11.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z1.s[2]\n"
+ "fmla z27.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[3]\n"
+ "fmla z12.s, z29.s, z3.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z1.s[3]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z9.s, z28.s, z4.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z4.s[3]\n"
+ "fmla z14.s, z29.s, z3.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z1.s[3]\n"
+ "fmla z26.s, z29.s, z0.s[3]\n"
+ "fmla z11.s, z28.s, z4.s[3]\n"
+ "fmla z15.s, z28.s, z3.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z1.s[3]\n"
+ "fmla z27.s, z28.s, z0.s[3]\n"
"bgt 66b\n"
"67:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -1563,123 +1563,123 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z29.s, z0.s[0]\n"
+ "fmla z12.s, z29.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z29.s, z4.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"addvl x10, x10, #1\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z21.s, z28.s, z3.s[0]\n"
+ "fmla z25.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z10.s, z29.s, z0.s[0]\n"
+ "fmla z14.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[0]\n"
+ "fmla z26.s, z29.s, z4.s[0]\n"
+ "fmla z11.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z3.s[0]\n"
+ "fmla z27.s, z28.s, z4.s[0]\n"
"ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[1]\n"
+ "fmla z12.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z3.s[1]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z29.s, z4.s[1]\n"
+ "fmla z9.s, z28.s, z0.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z13.s, z28.s, z1.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
"addvl x10, x10, #1\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z21.s, z28.s, z3.s[1]\n"
+ "fmla z25.s, z28.s, z4.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z10.s, z29.s, z0.s[1]\n"
+ "fmla z14.s, z29.s, z1.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z3.s[1]\n"
+ "fmla z26.s, z29.s, z4.s[1]\n"
+ "fmla z11.s, z28.s, z0.s[1]\n"
+ "fmla z15.s, z28.s, z1.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z3.s[1]\n"
+ "fmla z27.s, z28.s, z4.s[1]\n"
"ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[2]\n"
+ "fmla z12.s, z29.s, z1.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z3.s[2]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z29.s, z4.s[2]\n"
+ "fmla z9.s, z28.s, z0.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z13.s, z28.s, z1.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
"addvl x10, x10, #1\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z21.s, z28.s, z3.s[2]\n"
+ "fmla z25.s, z28.s, z4.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z10.s, z29.s, z0.s[2]\n"
+ "fmla z14.s, z29.s, z1.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z3.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[2]\n"
+ "fmla z11.s, z28.s, z0.s[2]\n"
+ "fmla z15.s, z28.s, z1.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z3.s[2]\n"
+ "fmla z27.s, z28.s, z4.s[2]\n"
"ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[3]\n"
+ "fmla z12.s, z29.s, z1.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z3.s[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z29.s, z4.s[3]\n"
+ "fmla z9.s, z28.s, z0.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z13.s, z28.s, z1.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z3.s[3]\n"
+ "fmla z25.s, z28.s, z4.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z0.s[3]\n"
+ "fmla z14.s, z29.s, z1.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z3.s[3]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z11.s, z28.s, z0.s[3]\n"
+ "fmla z15.s, z28.s, z1.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z3.s[3]\n"
+ "fmla z27.s, z28.s, z4.s[3]\n"
"68:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1692,49 +1692,49 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 69f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
"69:" // Height 5: No activation
"st1w { z8.s }, p4, [x13]\n"
"st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -1830,35 +1830,35 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"74:" // Height 6: no bias
"tbz %x[flags], #0, 75f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x24, x13, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 76f\n"
"75:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1890,16 +1890,16 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"77:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 78f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 79f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1911,146 +1911,146 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"b 79f\n"
"78:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"79:" // Height 6: input setup done
"cmp x27, #0x4\n"
"ble 81f\n"
"80:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x12]\n"
+ "ld1w { z0.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z1.s, z7.s[0]\n"
+ "fmla z12.s, z1.s, z6.s[0]\n"
+ "fmla z16.s, z1.s, z5.s[0]\n"
+ "fmla z20.s, z1.s, z4.s[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z1.s, z3.s[0]\n"
+ "fmla z28.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10]\n"
"add x21, x21, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[0]\n"
+ "fmla z13.s, z0.s, z6.s[0]\n"
+ "fmla z17.s, z0.s, z5.s[0]\n"
+ "fmla z21.s, z0.s, z4.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[0]\n"
+ "fmla z29.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z1.s, z7.s[0]\n"
+ "fmla z14.s, z1.s, z6.s[0]\n"
+ "fmla z18.s, z1.s, z5.s[0]\n"
+ "fmla z22.s, z1.s, z4.s[0]\n"
+ "fmla z26.s, z1.s, z3.s[0]\n"
+ "fmla z30.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[0]\n"
+ "fmla z15.s, z0.s, z6.s[0]\n"
+ "fmla z19.s, z0.s, z5.s[0]\n"
+ "fmla z23.s, z0.s, z4.s[0]\n"
+ "fmla z27.s, z0.s, z3.s[0]\n"
+ "fmla z31.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[1]\n"
+ "fmla z12.s, z1.s, z6.s[1]\n"
+ "fmla z16.s, z1.s, z5.s[1]\n"
+ "fmla z20.s, z1.s, z4.s[1]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z28.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[1]\n"
+ "fmla z13.s, z0.s, z6.s[1]\n"
+ "fmla z17.s, z0.s, z5.s[1]\n"
+ "fmla z21.s, z0.s, z4.s[1]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z29.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[1]\n"
+ "fmla z14.s, z1.s, z6.s[1]\n"
+ "fmla z18.s, z1.s, z5.s[1]\n"
+ "fmla z22.s, z1.s, z4.s[1]\n"
+ "fmla z26.s, z1.s, z3.s[1]\n"
+ "fmla z30.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[1]\n"
+ "fmla z15.s, z0.s, z6.s[1]\n"
+ "fmla z19.s, z0.s, z5.s[1]\n"
+ "fmla z23.s, z0.s, z4.s[1]\n"
+ "fmla z27.s, z0.s, z3.s[1]\n"
+ "fmla z31.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[2]\n"
+ "fmla z12.s, z1.s, z6.s[2]\n"
+ "fmla z16.s, z1.s, z5.s[2]\n"
+ "fmla z20.s, z1.s, z4.s[2]\n"
+ "fmla z24.s, z1.s, z3.s[2]\n"
+ "fmla z28.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[2]\n"
+ "fmla z13.s, z0.s, z6.s[2]\n"
+ "fmla z17.s, z0.s, z5.s[2]\n"
+ "fmla z21.s, z0.s, z4.s[2]\n"
+ "fmla z25.s, z0.s, z3.s[2]\n"
+ "fmla z29.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[2]\n"
+ "fmla z14.s, z1.s, z6.s[2]\n"
+ "fmla z18.s, z1.s, z5.s[2]\n"
+ "fmla z22.s, z1.s, z4.s[2]\n"
+ "fmla z26.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #3, MUL VL]\n"
"addvl x12, x12, #4\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[2]\n"
+ "fmla z15.s, z0.s, z6.s[2]\n"
+ "fmla z19.s, z0.s, z5.s[2]\n"
+ "fmla z23.s, z0.s, z4.s[2]\n"
+ "fmla z27.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #3, MUL VL]\n"
"addvl x11, x11, #4\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[3]\n"
+ "fmla z12.s, z1.s, z6.s[3]\n"
+ "fmla z16.s, z1.s, z5.s[3]\n"
+ "fmla z20.s, z1.s, z4.s[3]\n"
+ "fmla z24.s, z1.s, z3.s[3]\n"
+ "fmla z28.s, z1.s, z2.s[3]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[3]\n"
+ "fmla z13.s, z0.s, z6.s[3]\n"
+ "fmla z17.s, z0.s, z5.s[3]\n"
+ "fmla z21.s, z0.s, z4.s[3]\n"
+ "fmla z25.s, z0.s, z3.s[3]\n"
+ "fmla z29.s, z0.s, z2.s[3]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z1.s, z7.s[3]\n"
+ "fmla z14.s, z1.s, z6.s[3]\n"
+ "fmla z18.s, z1.s, z5.s[3]\n"
+ "fmla z22.s, z1.s, z4.s[3]\n"
+ "fmla z26.s, z1.s, z3.s[3]\n"
+ "fmla z30.s, z1.s, z2.s[3]\n"
+ "fmla z11.s, z0.s, z7.s[3]\n"
+ "fmla z15.s, z0.s, z6.s[3]\n"
+ "fmla z19.s, z0.s, z5.s[3]\n"
+ "fmla z23.s, z0.s, z4.s[3]\n"
+ "fmla z27.s, z0.s, z3.s[3]\n"
+ "fmla z31.s, z0.s, z2.s[3]\n"
"bgt 80b\n"
"81:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -2061,139 +2061,139 @@ void sve_ffhybrid_fp32_mla_6x4VL (
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
"ld1rqw { z5.s }, p0/Z, [x21]\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[0]\n"
+ "fmla z12.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z7.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z7.s, z4.s[0]\n"
+ "fmla z28.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[0]\n"
+ "fmla z21.s, z6.s, z3.s[0]\n"
+ "fmla z25.s, z6.s, z4.s[0]\n"
+ "fmla z29.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
+ "fmla z10.s, z7.s, z0.s[0]\n"
+ "fmla z14.s, z7.s, z1.s[0]\n"
+ "fmla z18.s, z7.s, z2.s[0]\n"
+ "fmla z22.s, z7.s, z3.s[0]\n"
+ "fmla z26.s, z7.s, z4.s[0]\n"
+ "fmla z30.s, z7.s, z5.s[0]\n"
+ "fmla z11.s, z6.s, z0.s[0]\n"
+ "fmla z15.s, z6.s, z1.s[0]\n"
+ "fmla z19.s, z6.s, z2.s[0]\n"
+ "fmla z23.s, z6.s, z3.s[0]\n"
+ "fmla z27.s, z6.s, z4.s[0]\n"
+ "fmla z31.s, z6.s, z5.s[0]\n"
"ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[1]\n"
+ "fmla z12.s, z7.s, z1.s[1]\n"
+ "fmla z16.s, z7.s, z2.s[1]\n"
+ "fmla z20.s, z7.s, z3.s[1]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z7.s, z4.s[1]\n"
+ "fmla z28.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z9.s, z6.s, z0.s[1]\n"
+ "fmla z13.s, z6.s, z1.s[1]\n"
"addvl x10, x10, #1\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z21.s, z6.s, z3.s[1]\n"
+ "fmla z25.s, z6.s, z4.s[1]\n"
+ "fmla z29.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
+ "fmla z10.s, z7.s, z0.s[1]\n"
+ "fmla z14.s, z7.s, z1.s[1]\n"
+ "fmla z18.s, z7.s, z2.s[1]\n"
+ "fmla z22.s, z7.s, z3.s[1]\n"
+ "fmla z26.s, z7.s, z4.s[1]\n"
+ "fmla z30.s, z7.s, z5.s[1]\n"
+ "fmla z11.s, z6.s, z0.s[1]\n"
+ "fmla z15.s, z6.s, z1.s[1]\n"
+ "fmla z19.s, z6.s, z2.s[1]\n"
+ "fmla z23.s, z6.s, z3.s[1]\n"
+ "fmla z27.s, z6.s, z4.s[1]\n"
+ "fmla z31.s, z6.s, z5.s[1]\n"
"ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[2]\n"
+ "fmla z12.s, z7.s, z1.s[2]\n"
+ "fmla z16.s, z7.s, z2.s[2]\n"
+ "fmla z20.s, z7.s, z3.s[2]\n"
"subs x27, x27, #0x1\n"
"addvl x12, x12, #1\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z7.s, z4.s[2]\n"
+ "fmla z28.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
"addvl x11, x11, #1\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z9.s, z6.s, z0.s[2]\n"
+ "fmla z13.s, z6.s, z1.s[2]\n"
"addvl x10, x10, #1\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z17.s, z6.s, z2.s[2]\n"
+ "fmla z21.s, z6.s, z3.s[2]\n"
+ "fmla z25.s, z6.s, z4.s[2]\n"
+ "fmla z29.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
+ "fmla z10.s, z7.s, z0.s[2]\n"
+ "fmla z14.s, z7.s, z1.s[2]\n"
+ "fmla z18.s, z7.s, z2.s[2]\n"
+ "fmla z22.s, z7.s, z3.s[2]\n"
+ "fmla z26.s, z7.s, z4.s[2]\n"
+ "fmla z30.s, z7.s, z5.s[2]\n"
+ "fmla z11.s, z6.s, z0.s[2]\n"
+ "fmla z15.s, z6.s, z1.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[2]\n"
+ "fmla z23.s, z6.s, z3.s[2]\n"
+ "fmla z27.s, z6.s, z4.s[2]\n"
+ "fmla z31.s, z6.s, z5.s[2]\n"
"ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x12]\n"
- "ld1w { z7.s }, p5/Z, [x11]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[3]\n"
+ "fmla z12.s, z7.s, z1.s[3]\n"
+ "fmla z16.s, z7.s, z2.s[3]\n"
+ "fmla z20.s, z7.s, z3.s[3]\n"
"addvl x12, x12, #1\n"
"addvl x11, x11, #1\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
+ "fmla z24.s, z7.s, z4.s[3]\n"
+ "fmla z28.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
"addvl x10, x10, #1\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x9]\n"
+ "fmla z9.s, z6.s, z0.s[3]\n"
+ "fmla z13.s, z6.s, z1.s[3]\n"
+ "fmla z17.s, z6.s, z2.s[3]\n"
+ "fmla z21.s, z6.s, z3.s[3]\n"
+ "fmla z25.s, z6.s, z4.s[3]\n"
+ "fmla z29.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
"addvl x9, x9, #1\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z7.s, z0.s[3]\n"
+ "fmla z14.s, z7.s, z1.s[3]\n"
+ "fmla z18.s, z7.s, z2.s[3]\n"
+ "fmla z22.s, z7.s, z3.s[3]\n"
+ "fmla z26.s, z7.s, z4.s[3]\n"
+ "fmla z30.s, z7.s, z5.s[3]\n"
+ "fmla z11.s, z6.s, z0.s[3]\n"
+ "fmla z15.s, z6.s, z1.s[3]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z23.s, z6.s, z3.s[3]\n"
+ "fmla z27.s, z6.s, z4.s[3]\n"
+ "fmla z31.s, z6.s, z5.s[3]\n"
"82:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2307,4 +2307,4 @@ void sve_ffhybrid_fp32_mla_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 3ee3e31206..887d78e1de 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index 36fc9d75ca..57f42cce77 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -174,22 +174,22 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 6f\n"
"4:" // Height 1: no bias
"tbz %x[flags], #0, 5f\n"
- "ld1w { z9.s }, p6/Z, [x13]\n"
- "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x13]\n"
+ "ld1w { z20.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
+ "ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
+ "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
+ "zip1 z10.d, z23.d, z16.d\n"
+ "zip2 z16.d, z23.d, z16.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
"b 6f\n"
@@ -211,11 +211,11 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"7:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 8f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 9f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -227,35 +227,35 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"ble 11f\n"
"10:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"add x24, x24, #0x10\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
@@ -266,33 +266,33 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"bgt 10b\n"
"11:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "ld1rqw { z22.s }, p0/Z, [x24]\n"
+ ".inst 0x658abed6 // bfcvt z22.h, p7/M, z22.s\n"
+ "uzp1 z22.h, z22.h, z22.h\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e6c8 // bfmmla z8.s, z22.h, z21.h\n"
+ ".inst 0x6474e6ce // bfmmla z14.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6475e6c9 // bfmmla z9.s, z22.h, z21.h\n"
+ ".inst 0x6474e6cf // bfmmla z15.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e6ca // bfmmla z10.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d0 // bfmmla z16.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6475e6cb // bfmmla z11.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d1 // bfmmla z17.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6cc // bfmmla z12.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d2 // bfmmla z18.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6475e6cd // bfmmla z13.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d3 // bfmmla z19.s, z22.h, z20.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
"addvl x10, x10, #2\n"
@@ -312,21 +312,21 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 13f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "fmin z8.s, p7/M, z8.s, z21.s\n"
+ "fmin z9.s, p7/M, z9.s, z21.s\n"
+ "fmin z10.s, p7/M, z10.s, z21.s\n"
+ "fmin z11.s, p7/M, z11.s, z21.s\n"
+ "fmin z12.s, p7/M, z12.s, z21.s\n"
+ "fmin z13.s, p7/M, z13.s, z21.s\n"
+ "fmax z8.s, p7/M, z8.s, z20.s\n"
+ "fmax z9.s, p7/M, z9.s, z20.s\n"
+ "fmax z10.s, p7/M, z10.s, z20.s\n"
+ "fmax z11.s, p7/M, z11.s, z20.s\n"
+ "fmax z12.s, p7/M, z12.s, z20.s\n"
+ "fmax z13.s, p7/M, z13.s, z20.s\n"
"13:" // Height 1: No activation
"st1w { z8.s }, p6, [x13]\n"
"st1w { z9.s }, p5, [x13, #1, MUL VL]\n"
@@ -413,29 +413,29 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"18:" // Height 2: no bias
"tbz %x[flags], #0, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x13, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x13]\n"
- "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
+ "ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
"b 20f\n"
@@ -457,12 +457,12 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"21:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 22f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -470,45 +470,45 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 23f\n"
"22:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"23:" // Height 2: input setup done
"cmp x25, #0x4\n"
"ble 25f\n"
"24:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
+ "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"addvl x12, x12, #2\n"
@@ -520,39 +520,39 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"bgt 24b\n"
"25:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
+ "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
"addvl x12, x12, #2\n"
"addvl x11, x11, #2\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
"addvl x28, x28, #2\n"
@@ -578,33 +578,33 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp2 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 27f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z4.s, p7/M, z4.s, z1.s\n"
- "fmin z14.s, p7/M, z14.s, z1.s\n"
- "fmin z15.s, p7/M, z15.s, z1.s\n"
- "fmin z16.s, p7/M, z16.s, z1.s\n"
- "fmin z17.s, p7/M, z17.s, z1.s\n"
- "fmin z18.s, p7/M, z18.s, z1.s\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmax z4.s, p7/M, z4.s, z0.s\n"
- "fmax z14.s, p7/M, z14.s, z0.s\n"
- "fmax z15.s, p7/M, z15.s, z0.s\n"
- "fmax z16.s, p7/M, z16.s, z0.s\n"
- "fmax z17.s, p7/M, z17.s, z0.s\n"
- "fmax z18.s, p7/M, z18.s, z0.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z20.s\n"
+ "fmin z14.s, p7/M, z14.s, z20.s\n"
+ "fmin z15.s, p7/M, z15.s, z20.s\n"
+ "fmin z16.s, p7/M, z16.s, z20.s\n"
+ "fmin z17.s, p7/M, z17.s, z20.s\n"
+ "fmin z18.s, p7/M, z18.s, z20.s\n"
+ "fmin z8.s, p7/M, z8.s, z20.s\n"
+ "fmin z9.s, p7/M, z9.s, z20.s\n"
+ "fmin z10.s, p7/M, z10.s, z20.s\n"
+ "fmin z11.s, p7/M, z11.s, z20.s\n"
+ "fmin z12.s, p7/M, z12.s, z20.s\n"
+ "fmin z13.s, p7/M, z13.s, z20.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
"27:" // Height 2: No activation
"st1w { z4.s }, p6, [x13]\n"
"st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
@@ -709,38 +709,38 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"32:" // Height 3: no bias
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x13, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x13]\n"
- "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "ld1w { z21.s }, p6/Z, [x22]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
- "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x20]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
"zip1 z21.d, z22.d, z27.d\n"
@@ -751,8 +751,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
"zip2 z30.d, z25.d, z30.d\n"
- "zip1 z25.d, z4.d, z31.d\n"
- "zip2 z31.d, z4.d, z31.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 34f\n"
"33:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -784,13 +784,13 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"35:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -799,125 +799,125 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 37f\n"
"36:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"37:" // Height 3: input setup done
"cmp x25, #0x4\n"
"ble 39f\n"
"38:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
"uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
"cmp x25, #0x4\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
"addvl x10, x10, #2\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
"addvl x9, x9, #2\n"
"addvl x28, x28, #2\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
"addvl x27, x27, #2\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 38b\n"
"39:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
"uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
"addvl x9, x9, #2\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"addvl x28, x28, #2\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
"addvl x27, x27, #2\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"40:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -946,45 +946,45 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 41f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
"ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z4.s, p7/M, z4.s, z1.s\n"
- "fmin z14.s, p7/M, z14.s, z1.s\n"
- "fmin z15.s, p7/M, z15.s, z1.s\n"
- "fmin z16.s, p7/M, z16.s, z1.s\n"
- "fmin z17.s, p7/M, z17.s, z1.s\n"
- "fmin z18.s, p7/M, z18.s, z1.s\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmin z20.s, p7/M, z20.s, z1.s\n"
- "fmin z21.s, p7/M, z21.s, z1.s\n"
- "fmin z22.s, p7/M, z22.s, z1.s\n"
- "fmin z23.s, p7/M, z23.s, z1.s\n"
- "fmin z24.s, p7/M, z24.s, z1.s\n"
- "fmin z25.s, p7/M, z25.s, z1.s\n"
- "fmax z4.s, p7/M, z4.s, z0.s\n"
- "fmax z14.s, p7/M, z14.s, z0.s\n"
- "fmax z15.s, p7/M, z15.s, z0.s\n"
- "fmax z16.s, p7/M, z16.s, z0.s\n"
- "fmax z17.s, p7/M, z17.s, z0.s\n"
- "fmax z18.s, p7/M, z18.s, z0.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
- "fmax z20.s, p7/M, z20.s, z0.s\n"
- "fmax z21.s, p7/M, z21.s, z0.s\n"
- "fmax z22.s, p7/M, z22.s, z0.s\n"
- "fmax z23.s, p7/M, z23.s, z0.s\n"
- "fmax z24.s, p7/M, z24.s, z0.s\n"
- "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z0.s\n"
+ "fmin z14.s, p7/M, z14.s, z0.s\n"
+ "fmin z15.s, p7/M, z15.s, z0.s\n"
+ "fmin z16.s, p7/M, z16.s, z0.s\n"
+ "fmin z17.s, p7/M, z17.s, z0.s\n"
+ "fmin z18.s, p7/M, z18.s, z0.s\n"
+ "fmin z8.s, p7/M, z8.s, z0.s\n"
+ "fmin z9.s, p7/M, z9.s, z0.s\n"
+ "fmin z10.s, p7/M, z10.s, z0.s\n"
+ "fmin z11.s, p7/M, z11.s, z0.s\n"
+ "fmin z12.s, p7/M, z12.s, z0.s\n"
+ "fmin z13.s, p7/M, z13.s, z0.s\n"
+ "fmin z20.s, p7/M, z20.s, z0.s\n"
+ "fmin z21.s, p7/M, z21.s, z0.s\n"
+ "fmin z22.s, p7/M, z22.s, z0.s\n"
+ "fmin z23.s, p7/M, z23.s, z0.s\n"
+ "fmin z24.s, p7/M, z24.s, z0.s\n"
+ "fmin z25.s, p7/M, z25.s, z0.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "fmax z20.s, p7/M, z20.s, z19.s\n"
+ "fmax z21.s, p7/M, z21.s, z19.s\n"
+ "fmax z22.s, p7/M, z22.s, z19.s\n"
+ "fmax z23.s, p7/M, z23.s, z19.s\n"
+ "fmax z24.s, p7/M, z24.s, z19.s\n"
+ "fmax z25.s, p7/M, z25.s, z19.s\n"
"41:" // Height 3: No activation
"st1w { z4.s }, p6, [x13]\n"
"st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
@@ -1098,57 +1098,57 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"46:" // Height 4: no bias
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x13, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x13]\n"
+ "add x22, x13, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "ld1w { z21.s }, p6/Z, [x22]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
- "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
- "ld1w { z26.s }, p6/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
- "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
"zip1 z21.d, z22.d, z27.d\n"
"zip2 z27.d, z22.d, z27.d\n"
- "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z22.d, z23.d, z28.d\n"
"zip2 z28.d, z23.d, z28.d\n"
- "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z23.d, z24.d, z29.d\n"
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
"zip2 z30.d, z25.d, z30.d\n"
- "zip1 z25.d, z4.d, z31.d\n"
- "zip2 z31.d, z4.d, z31.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 48f\n"
"47:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1180,14 +1180,14 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"49:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1197,135 +1197,135 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
"b 51f\n"
"50:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"51:" // Height 4: input setup done
"cmp x25, #0x4\n"
"ble 53f\n"
"52:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z3.s }, p0/Z, [x21]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
"sub x25, x25, #0x4\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
"cmp x25, #0x4\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"add x21, x21, #0x10\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
"addvl x12, x12, #2\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x11, x11, #2\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
"addvl x10, x10, #2\n"
"addvl x9, x9, #2\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
"addvl x28, x28, #2\n"
"addvl x27, x27, #2\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"bgt 52b\n"
"53:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z3.s }, p0/Z, [x21]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x12]\n"
- "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z6.h }, p7/Z, [x11]\n"
- "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
"addvl x12, x12, #2\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x10]\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
"addvl x11, x11, #2\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x9]\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
"addvl x10, x10, #2\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
"addvl x9, x9, #2\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
"addvl x28, x28, #2\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x27]\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x27, x27, #2\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"54:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -1461,4 +1461,4 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
index 5792a7152d..d0ef531c33 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 7649336c36..576bd47039 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -53,33 +53,33 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
__asm__ __volatile__(
"ptrue p0.b\n"
"1:" // Height loop
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x24, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
- "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "add x22, x26, x20, LSL #1\n"
+ "add x22, x24, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
- "cmp x25, x23\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov %x[Apanel], x24\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "cmp x25, x23\n"
- "mov x21, x26\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
- "mov x22, x26\n"
+ "mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "ld1h { z4.h }, p0/Z, [x26]\n"
+ "ld1h { z4.h }, p0/Z, [x24]\n"
"mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
@@ -88,13 +88,13 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
- "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n"
"mov z17.b, #0x0\n"
"mov z18.b, #0x0\n"
"ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
"mov z19.b, #0x0\n"
"mov z20.b, #0x0\n"
- "addvl x26, x26, #2\n"
+ "addvl x24, x24, #2\n"
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"add %x[Apanel], %x[Apanel], #0x30\n"
@@ -109,83 +109,83 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
"mov z31.b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- "ld1h { z6.h }, p0/Z, [x22]\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x21]\n"
- "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
+ ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"cmp x20, #0x2\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x26]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
+ "ld1h { z3.h }, p0/Z, [x24]\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+ ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
+ ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n"
"ld1h { z7.h }, p0/Z, [x21, #3, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
"addvl x22, x22, #4\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
"addvl x21, x21, #4\n"
- ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
- ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x26, #2, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "ld1h { z5.h }, p0/Z, [x26, #3, MUL VL]\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
+ ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
"ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
- "addvl x26, x26, #4\n"
+ "addvl x24, x24, #4\n"
"bge 4b\n"
"5:" // main loop skip
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
@@ -193,115 +193,115 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
"ld1h { z6.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x21]\n"
- "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"addvl x22, x22, #2\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
"addvl x21, x21, #2\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
"cbz x20, 6f\n"
- "ld1h { z6.h }, p0/Z, [x26]\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z4.h }, p0/Z, [x22]\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x21]\n"
- "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
+ "ld1h { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1h { z0.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x21]\n"
+ "ld1h { z0.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
+ ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
- ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
- ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n"
+ ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n"
+ ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"6:" // multiply loop done
- "decw x25, ALL, MUL #3\n"
- "uzp1 z4.d, z8.d, z11.d\n"
+ "decw x26, ALL, MUL #3\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "uzp1 z11.d, z9.d, z12.d\n"
+ "uzp1 z1.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
- "st1w { z4.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z12.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "uzp1 z13.d, z14.d, z17.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "uzp1 z2.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
"st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
- "cmp x25, XZR\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "uzp1 z18.d, z16.d, z19.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
"st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "uzp1 z19.d, z20.d, z23.d\n"
- "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z2.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z20.d, z20.d, z23.d\n"
"uzp1 z23.d, z21.d, z24.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
"addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z21.d, z21.d, z24.d\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
- "uzp1 z24.d, z22.d, z25.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
"st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
"st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
"st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "uzp1 z30.d, z28.d, z31.d\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
"st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
index 6d36bf8bbf..60f1b699c3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
index 1d502f5354..69ddb21c31 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -52,33 +52,33 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
__asm__ __volatile__(
"ptrue p0.b\n"
"1:" // Height loop
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x24, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
- "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cnth x23, ALL, MUL #2\n"
- "add x22, x26, x20, LSL #1\n"
+ "add x22, x24, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
- "cmp x25, x23\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov %x[Apanel], x24\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"dech x23\n"
- "cmp x25, x23\n"
- "mov x21, x26\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
- "mov x22, x26\n"
+ "mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "ld1h { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z0.h }, p0/Z, [x24]\n"
"mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
@@ -116,12 +116,12 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #10]\n"
"fmla z14.h, p0/M, z0.h, z5.h\n"
"fmla z15.h, p0/M, z1.h, z5.h\n"
"cmp x20, #0x2\n"
"fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #12]\n"
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
@@ -130,57 +130,57 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z21.h, p0/M, z1.h, z3.h\n"
"fmla z22.h, p0/M, z2.h, z3.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z23.h, p0/M, z0.h, z7.h\n"
+ "fmla z24.h, p0/M, z1.h, z7.h\n"
+ "fmla z25.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #18]\n"
+ "fmla z26.h, p0/M, z0.h, z4.h\n"
+ "fmla z27.h, p0/M, z1.h, z4.h\n"
+ "fmla z28.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #20]\n"
"fmla z29.h, p0/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z30.h, p0/M, z1.h, z6.h\n"
"fmla z31.h, p0/M, z2.h, z6.h\n"
- "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n"
- "fmla z8.h, p0/M, z0.h, z3.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n"
- "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "fmla z8.h, p0/M, z7.h, z3.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+ "fmla z9.h, p0/M, z6.h, z3.h\n"
"fmla z10.h, p0/M, z2.h, z3.h\n"
- "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z11.h, p0/M, z7.h, z5.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z12.h, p0/M, z1.h, z4.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
- "fmla z14.h, p0/M, z0.h, z5.h\n"
- "fmla z15.h, p0/M, z1.h, z5.h\n"
- "addvl x26, x26, #2\n"
- "fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z17.h, p0/M, z0.h, z6.h\n"
- "fmla z18.h, p0/M, z1.h, z6.h\n"
- "fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n"
+ "fmla z12.h, p0/M, z6.h, z5.h\n"
+ "fmla z13.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #26]\n"
+ "fmla z14.h, p0/M, z7.h, z4.h\n"
+ "fmla z15.h, p0/M, z6.h, z4.h\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z7.h, z1.h\n"
+ "fmla z18.h, p0/M, z6.h, z1.h\n"
+ "fmla z19.h, p0/M, z2.h, z1.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
"addvl x22, x22, #2\n"
"addvl x21, x21, #2\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z20.h, p0/M, z7.h, z3.h\n"
+ "fmla z21.h, p0/M, z6.h, z3.h\n"
"fmla z22.h, p0/M, z2.h, z3.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z23.h, p0/M, z7.h, z5.h\n"
+ "fmla z24.h, p0/M, z6.h, z5.h\n"
+ "fmla z25.h, p0/M, z2.h, z5.h\n"
+ "fmla z26.h, p0/M, z7.h, z0.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z27.h, p0/M, z6.h, z0.h\n"
+ "fmla z28.h, p0/M, z2.h, z0.h\n"
"ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p0/Z, [x26]\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z29.h, p0/M, z7.h, z1.h\n"
+ "ld1h { z0.h }, p0/Z, [x24]\n"
+ "fmla z30.h, p0/M, z6.h, z1.h\n"
+ "fmla z31.h, p0/M, z2.h, z1.h\n"
"ld1h { z1.h }, p0/Z, [x22]\n"
"ld1h { z2.h }, p0/Z, [x21]\n"
"ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
@@ -188,9 +188,9 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"5:" // main loop skip
"fmla z8.h, p0/M, z0.h, z3.h\n"
"fmla z9.h, p0/M, z1.h, z3.h\n"
- "addvl x26, x26, #1\n"
+ "addvl x24, x24, #1\n"
"fmla z10.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -203,11 +203,11 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
"addvl x21, x21, #1\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
"fmla z23.h, p0/M, z0.h, z4.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z24.h, p0/M, z1.h, z4.h\n"
@@ -215,50 +215,50 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"fmla z26.h, p0/M, z0.h, z5.h\n"
"fmla z27.h, p0/M, z1.h, z5.h\n"
"fmla z28.h, p0/M, z2.h, z5.h\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z29.h, p0/M, z0.h, z3.h\n"
+ "fmla z30.h, p0/M, z1.h, z3.h\n"
+ "fmla z31.h, p0/M, z2.h, z3.h\n"
"cbz x20, 6f\n"
- "ld1h { z0.h }, p0/Z, [x26]\n"
- "ld1h { z1.h }, p0/Z, [x22]\n"
- "ld1h { z2.h }, p0/Z, [x21]\n"
+ "ld1h { z6.h }, p0/Z, [x24]\n"
+ "ld1h { z5.h }, p0/Z, [x22]\n"
+ "ld1h { z4.h }, p0/Z, [x21]\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.h, p0/M, z0.h, z3.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z9.h, p0/M, z1.h, z3.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
- "fmla z10.h, p0/M, z2.h, z3.h\n"
- "fmla z11.h, p0/M, z0.h, z4.h\n"
- "fmla z12.h, p0/M, z1.h, z4.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+ "fmla z10.h, p0/M, z4.h, z3.h\n"
+ "fmla z11.h, p0/M, z6.h, z2.h\n"
+ "fmla z12.h, p0/M, z5.h, z2.h\n"
+ "fmla z13.h, p0/M, z4.h, z2.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
- "fmla z14.h, p0/M, z0.h, z5.h\n"
- "fmla z15.h, p0/M, z1.h, z5.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
- "fmla z16.h, p0/M, z2.h, z5.h\n"
- "fmla z17.h, p0/M, z0.h, z6.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z18.h, p0/M, z1.h, z6.h\n"
- "fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z14.h, p0/M, z6.h, z1.h\n"
+ "fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z16.h, p0/M, z4.h, z1.h\n"
+ "fmla z17.h, p0/M, z6.h, z0.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z18.h, p0/M, z5.h, z0.h\n"
+ "fmla z19.h, p0/M, z4.h, z0.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z6.h, z3.h\n"
+ "fmla z21.h, p0/M, z5.h, z3.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z22.h, p0/M, z4.h, z3.h\n"
+ "fmla z23.h, p0/M, z6.h, z2.h\n"
+ "fmla z24.h, p0/M, z5.h, z2.h\n"
+ "fmla z25.h, p0/M, z4.h, z2.h\n"
+ "fmla z26.h, p0/M, z6.h, z1.h\n"
+ "fmla z27.h, p0/M, z5.h, z1.h\n"
+ "fmla z28.h, p0/M, z4.h, z1.h\n"
+ "fmla z29.h, p0/M, z6.h, z0.h\n"
+ "fmla z30.h, p0/M, z5.h, z0.h\n"
+ "fmla z31.h, p0/M, z4.h, z0.h\n"
"6:" // multiply loop done
- "dech x25, ALL, MUL #3\n"
+ "dech x26, ALL, MUL #3\n"
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
- "cmp x25, XZR\n"
+ "cmp x26, XZR\n"
"st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
@@ -289,7 +289,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
index de219aa2bf..23503fa108 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
@@ -52,26 +52,26 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
__asm__ __volatile__(
"ptrue p0.b\n"
"1:" // Height loop
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x24, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
- "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cnth x23, ALL, MUL #2\n"
- "add x22, x26, x20, LSL #1\n"
+ "add x22, x24, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
- "cmp x25, x23\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov %x[Apanel], x24\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"dech x23\n"
- "cmp x25, x23\n"
- "mov x21, x26\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
- "mov x22, x26\n"
+ "mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
@@ -81,7 +81,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
"mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
- "ld1h { z2.h }, p0/Z, [x26]\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
"mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
"ld1h { z3.h }, p0/Z, [x22]\n"
@@ -107,19 +107,19 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"4:" // main loop head
"fmla z8.h, z2.h, z0.h[0]\n"
"fmla z11.h, z2.h, z0.h[1]\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
"fmla z14.h, z2.h, z0.h[2]\n"
"fmla z17.h, z2.h, z0.h[3]\n"
- "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z6.h }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z20.h, z2.h, z0.h[4]\n"
"fmla z23.h, z2.h, z0.h[5]\n"
- "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
"fmla z26.h, z2.h, z0.h[6]\n"
"fmla z29.h, z2.h, z0.h[7]\n"
- "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z1.h }, p0/Z, [x21, #1, MUL VL]\n"
"fmla z9.h, z3.h, z0.h[0]\n"
"fmla z12.h, z3.h, z0.h[1]\n"
- "addvl x26, x26, #2\n"
+ "addvl x24, x24, #2\n"
"fmla z15.h, z3.h, z0.h[2]\n"
"fmla z18.h, z3.h, z0.h[3]\n"
"addvl x22, x22, #2\n"
@@ -137,36 +137,36 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z22.h, z4.h, z0.h[4]\n"
"fmla z25.h, z4.h, z0.h[5]\n"
- "ld1h { z2.h }, p0/Z, [x26]\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
"fmla z28.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "fmla z11.h, z5.h, z1.h[1]\n"
+ "fmla z8.h, z6.h, z7.h[0]\n"
+ "fmla z11.h, z6.h, z7.h[1]\n"
"ld1h { z3.h }, p0/Z, [x22]\n"
- "fmla z14.h, z5.h, z1.h[2]\n"
- "fmla z17.h, z5.h, z1.h[3]\n"
+ "fmla z14.h, z6.h, z7.h[2]\n"
+ "fmla z17.h, z6.h, z7.h[3]\n"
"ld1h { z4.h }, p0/Z, [x21]\n"
- "fmla z20.h, z5.h, z1.h[4]\n"
- "fmla z23.h, z5.h, z1.h[5]\n"
- "fmla z26.h, z5.h, z1.h[6]\n"
- "fmla z29.h, z5.h, z1.h[7]\n"
- "fmla z9.h, z6.h, z1.h[0]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z15.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z1.h[3]\n"
- "fmla z21.h, z6.h, z1.h[4]\n"
- "fmla z24.h, z6.h, z1.h[5]\n"
- "fmla z27.h, z6.h, z1.h[6]\n"
- "fmla z30.h, z6.h, z1.h[7]\n"
- "fmla z10.h, z7.h, z1.h[0]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z16.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z1.h[3]\n"
- "fmla z22.h, z7.h, z1.h[4]\n"
- "fmla z25.h, z7.h, z1.h[5]\n"
- "fmla z28.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
+ "fmla z20.h, z6.h, z7.h[4]\n"
+ "fmla z23.h, z6.h, z7.h[5]\n"
+ "fmla z26.h, z6.h, z7.h[6]\n"
+ "fmla z29.h, z6.h, z7.h[7]\n"
+ "fmla z9.h, z5.h, z7.h[0]\n"
+ "fmla z12.h, z5.h, z7.h[1]\n"
+ "fmla z15.h, z5.h, z7.h[2]\n"
+ "fmla z18.h, z5.h, z7.h[3]\n"
+ "fmla z21.h, z5.h, z7.h[4]\n"
+ "fmla z24.h, z5.h, z7.h[5]\n"
+ "fmla z27.h, z5.h, z7.h[6]\n"
+ "fmla z30.h, z5.h, z7.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z13.h, z1.h, z7.h[1]\n"
+ "fmla z16.h, z1.h, z7.h[2]\n"
+ "fmla z19.h, z1.h, z7.h[3]\n"
+ "fmla z22.h, z1.h, z7.h[4]\n"
+ "fmla z25.h, z1.h, z7.h[5]\n"
+ "fmla z28.h, z1.h, z7.h[6]\n"
+ "fmla z31.h, z1.h, z7.h[7]\n"
"bge 4b\n"
"5:" // main loop skip
"fmla z8.h, z2.h, z0.h[0]\n"
@@ -174,7 +174,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"add %x[Apanel], %x[Apanel], #0x10\n"
"fmla z14.h, z2.h, z0.h[2]\n"
"fmla z17.h, z2.h, z0.h[3]\n"
- "addvl x26, x26, #1\n"
+ "addvl x24, x24, #1\n"
"fmla z20.h, z2.h, z0.h[4]\n"
"fmla z23.h, z2.h, z0.h[5]\n"
"addvl x22, x22, #1\n"
@@ -198,39 +198,39 @@ void sve_ffinterleaved_fp16_mla_8x3VL(
"fmla z28.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"cbz x20, 6f\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "ld1h { z5.h }, p0/Z, [x26]\n"
- "fmla z8.h, z5.h, z0.h[0]\n"
- "ld1h { z6.h }, p0/Z, [x22]\n"
- "ld1h { z7.h }, p0/Z, [x21]\n"
- "fmla z11.h, z5.h, z0.h[1]\n"
- "fmla z14.h, z5.h, z0.h[2]\n"
- "fmla z17.h, z5.h, z0.h[3]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "ld1h { z0.h }, p0/Z, [x21]\n"
+ "fmla z11.h, z2.h, z3.h[1]\n"
+ "fmla z14.h, z2.h, z3.h[2]\n"
+ "fmla z17.h, z2.h, z3.h[3]\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla z20.h, z5.h, z0.h[4]\n"
- "fmla z23.h, z5.h, z0.h[5]\n"
- "fmla z26.h, z5.h, z0.h[6]\n"
- "fmla z29.h, z5.h, z0.h[7]\n"
- "fmla z9.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z0.h[1]\n"
- "fmla z15.h, z6.h, z0.h[2]\n"
- "fmla z18.h, z6.h, z0.h[3]\n"
- "fmla z21.h, z6.h, z0.h[4]\n"
- "fmla z24.h, z6.h, z0.h[5]\n"
- "fmla z27.h, z6.h, z0.h[6]\n"
- "fmla z30.h, z6.h, z0.h[7]\n"
- "fmla z10.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z0.h[1]\n"
- "fmla z16.h, z7.h, z0.h[2]\n"
- "fmla z19.h, z7.h, z0.h[3]\n"
- "fmla z22.h, z7.h, z0.h[4]\n"
- "fmla z25.h, z7.h, z0.h[5]\n"
- "fmla z28.h, z7.h, z0.h[6]\n"
- "fmla z31.h, z7.h, z0.h[7]\n"
+ "fmla z20.h, z2.h, z3.h[4]\n"
+ "fmla z23.h, z2.h, z3.h[5]\n"
+ "fmla z26.h, z2.h, z3.h[6]\n"
+ "fmla z29.h, z2.h, z3.h[7]\n"
+ "fmla z9.h, z1.h, z3.h[0]\n"
+ "fmla z12.h, z1.h, z3.h[1]\n"
+ "fmla z15.h, z1.h, z3.h[2]\n"
+ "fmla z18.h, z1.h, z3.h[3]\n"
+ "fmla z21.h, z1.h, z3.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z27.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z3.h[7]\n"
+ "fmla z10.h, z0.h, z3.h[0]\n"
+ "fmla z13.h, z0.h, z3.h[1]\n"
+ "fmla z16.h, z0.h, z3.h[2]\n"
+ "fmla z19.h, z0.h, z3.h[3]\n"
+ "fmla z22.h, z0.h, z3.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z28.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z3.h[7]\n"
"6:" // multiply loop done
- "dech x25, ALL, MUL #3\n"
+ "dech x26, ALL, MUL #3\n"
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
- "cmp x25, XZR\n"
+ "cmp x26, XZR\n"
"st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
index aa3507ee73..ac6986913d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
index 8c8b6b0675..c65c3a3ce4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -52,33 +52,33 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
__asm__ __volatile__(
"ptrue p0.b\n"
"1:" // Height loop
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x24, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
- "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "add x22, x26, x20, LSL #2\n"
+ "add x22, x24, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
- "cmp x25, x23\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov %x[Apanel], x24\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "cmp x25, x23\n"
- "mov x21, x26\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
- "mov x22, x26\n"
+ "mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
- "ld1w { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z0.s }, p0/Z, [x24]\n"
"mov z11.b, #0x0\n"
"mov z12.b, #0x0\n"
"ld1w { z1.s }, p0/Z, [x22]\n"
@@ -116,12 +116,12 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #20]\n"
"fmla z14.s, p0/M, z0.s, z5.s\n"
"fmla z15.s, p0/M, z1.s, z5.s\n"
"cmp x20, #0x2\n"
"fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #24]\n"
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
@@ -130,57 +130,57 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z21.s, p0/M, z1.s, z3.s\n"
"fmla z22.s, p0/M, z2.s, z3.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
+ "fmla z23.s, p0/M, z0.s, z7.s\n"
+ "fmla z24.s, p0/M, z1.s, z7.s\n"
+ "fmla z25.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #36]\n"
+ "fmla z26.s, p0/M, z0.s, z4.s\n"
+ "fmla z27.s, p0/M, z1.s, z4.s\n"
+ "fmla z28.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #40]\n"
"fmla z29.s, p0/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z7.s }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z30.s, p0/M, z1.s, z6.s\n"
"fmla z31.s, p0/M, z2.s, z6.s\n"
- "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z6.s }, p0/Z, [x22, #1, MUL VL]\n"
"ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
- "fmla z8.s, p0/M, z0.s, z3.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
- "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "fmla z8.s, p0/M, z7.s, z3.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "fmla z9.s, p0/M, z6.s, z3.s\n"
"fmla z10.s, p0/M, z2.s, z3.s\n"
- "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z11.s, p0/M, z7.s, z5.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
- "fmla z12.s, p0/M, z1.s, z4.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "fmla z14.s, p0/M, z0.s, z5.s\n"
- "fmla z15.s, p0/M, z1.s, z5.s\n"
- "addvl x26, x26, #2\n"
- "fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
- "fmla z17.s, p0/M, z0.s, z6.s\n"
- "fmla z18.s, p0/M, z1.s, z6.s\n"
- "fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "fmla z12.s, p0/M, z6.s, z5.s\n"
+ "fmla z13.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #52]\n"
+ "fmla z14.s, p0/M, z7.s, z4.s\n"
+ "fmla z15.s, p0/M, z6.s, z4.s\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z7.s, z1.s\n"
+ "fmla z18.s, p0/M, z6.s, z1.s\n"
+ "fmla z19.s, p0/M, z2.s, z1.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"addvl x22, x22, #2\n"
"addvl x21, x21, #2\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z20.s, p0/M, z7.s, z3.s\n"
+ "fmla z21.s, p0/M, z6.s, z3.s\n"
"fmla z22.s, p0/M, z2.s, z3.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z23.s, p0/M, z7.s, z5.s\n"
+ "fmla z24.s, p0/M, z6.s, z5.s\n"
+ "fmla z25.s, p0/M, z2.s, z5.s\n"
+ "fmla z26.s, p0/M, z7.s, z0.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z27.s, p0/M, z6.s, z0.s\n"
+ "fmla z28.s, p0/M, z2.s, z0.s\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p0/Z, [x26]\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z29.s, p0/M, z7.s, z1.s\n"
+ "ld1w { z0.s }, p0/Z, [x24]\n"
+ "fmla z30.s, p0/M, z6.s, z1.s\n"
+ "fmla z31.s, p0/M, z2.s, z1.s\n"
"ld1w { z1.s }, p0/Z, [x22]\n"
"ld1w { z2.s }, p0/Z, [x21]\n"
"ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
@@ -188,9 +188,9 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"5:" // main loop skip
"fmla z8.s, p0/M, z0.s, z3.s\n"
"fmla z9.s, p0/M, z1.s, z3.s\n"
- "addvl x26, x26, #1\n"
+ "addvl x24, x24, #1\n"
"fmla z10.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -203,11 +203,11 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
"addvl x21, x21, #1\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
"fmla z23.s, p0/M, z0.s, z4.s\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z24.s, p0/M, z1.s, z4.s\n"
@@ -215,50 +215,50 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"fmla z26.s, p0/M, z0.s, z5.s\n"
"fmla z27.s, p0/M, z1.s, z5.s\n"
"fmla z28.s, p0/M, z2.s, z5.s\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z29.s, p0/M, z0.s, z3.s\n"
+ "fmla z30.s, p0/M, z1.s, z3.s\n"
+ "fmla z31.s, p0/M, z2.s, z3.s\n"
"cbz x20, 6f\n"
- "ld1w { z0.s }, p0/Z, [x26]\n"
- "ld1w { z1.s }, p0/Z, [x22]\n"
- "ld1w { z2.s }, p0/Z, [x21]\n"
+ "ld1w { z6.s }, p0/Z, [x24]\n"
+ "ld1w { z5.s }, p0/Z, [x22]\n"
+ "ld1w { z4.s }, p0/Z, [x21]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.s, p0/M, z0.s, z3.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
- "fmla z9.s, p0/M, z1.s, z3.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z10.s, p0/M, z2.s, z3.s\n"
- "fmla z11.s, p0/M, z0.s, z4.s\n"
- "fmla z12.s, p0/M, z1.s, z4.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z10.s, p0/M, z4.s, z3.s\n"
+ "fmla z11.s, p0/M, z6.s, z2.s\n"
+ "fmla z12.s, p0/M, z5.s, z2.s\n"
+ "fmla z13.s, p0/M, z4.s, z2.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z14.s, p0/M, z0.s, z5.s\n"
- "fmla z15.s, p0/M, z1.s, z5.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
- "fmla z16.s, p0/M, z2.s, z5.s\n"
- "fmla z17.s, p0/M, z0.s, z6.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z18.s, p0/M, z1.s, z6.s\n"
- "fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z14.s, p0/M, z6.s, z1.s\n"
+ "fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z16.s, p0/M, z4.s, z1.s\n"
+ "fmla z17.s, p0/M, z6.s, z0.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z18.s, p0/M, z5.s, z0.s\n"
+ "fmla z19.s, p0/M, z4.s, z0.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z6.s, z3.s\n"
+ "fmla z21.s, p0/M, z5.s, z3.s\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z22.s, p0/M, z4.s, z3.s\n"
+ "fmla z23.s, p0/M, z6.s, z2.s\n"
+ "fmla z24.s, p0/M, z5.s, z2.s\n"
+ "fmla z25.s, p0/M, z4.s, z2.s\n"
+ "fmla z26.s, p0/M, z6.s, z1.s\n"
+ "fmla z27.s, p0/M, z5.s, z1.s\n"
+ "fmla z28.s, p0/M, z4.s, z1.s\n"
+ "fmla z29.s, p0/M, z6.s, z0.s\n"
+ "fmla z30.s, p0/M, z5.s, z0.s\n"
+ "fmla z31.s, p0/M, z4.s, z0.s\n"
"6:" // multiply loop done
- "decw x25, ALL, MUL #3\n"
+ "decw x26, ALL, MUL #3\n"
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
- "cmp x25, XZR\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
@@ -289,7 +289,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
index 4a0b31daff..4b20be6f01 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
@@ -52,26 +52,26 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
__asm__ __volatile__(
"ptrue p0.b\n"
"1:" // Height loop
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
- "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x24, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
"2:" // Width loop
- "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
"cntw x23, ALL, MUL #2\n"
- "add x22, x26, x20, LSL #2\n"
+ "add x22, x24, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
- "cmp x25, x23\n"
+ "cmp x26, x23\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov %x[Apanel], x24\n"
+ "mov %x[Apanel], x25\n"
"bgt 3f\n"
"decw x23\n"
- "cmp x25, x23\n"
- "mov x21, x26\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
"bgt 3f\n"
- "mov x22, x26\n"
+ "mov x22, x24\n"
"3:" // B setup done
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
"cmp x20, #0x2\n"
@@ -84,7 +84,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
"mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
- "ld1w { z4.s }, p0/Z, [x26]\n"
+ "ld1w { z4.s }, p0/Z, [x24]\n"
"mov z15.b, #0x0\n"
"mov z16.b, #0x0\n"
"ld1w { z5.s }, p0/Z, [x22]\n"
@@ -108,19 +108,19 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"4:" // main loop head
"fmla z8.s, z4.s, z0.s[0]\n"
"fmla z11.s, z4.s, z0.s[1]\n"
- "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
"fmla z14.s, z4.s, z0.s[2]\n"
"fmla z17.s, z4.s, z0.s[3]\n"
- "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
"fmla z20.s, z4.s, z1.s[0]\n"
"fmla z23.s, z4.s, z1.s[1]\n"
"sub x20, x20, #0x2\n"
"fmla z26.s, z4.s, z1.s[2]\n"
"fmla z29.s, z4.s, z1.s[3]\n"
- "ld1w { z4.s }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z4.s }, p0/Z, [x24, #1, MUL VL]\n"
"fmla z9.s, z5.s, z0.s[0]\n"
"fmla z12.s, z5.s, z0.s[1]\n"
- "addvl x26, x26, #2\n"
+ "addvl x24, x24, #2\n"
"fmla z15.s, z5.s, z0.s[2]\n"
"fmla z18.s, z5.s, z0.s[3]\n"
"cmp x20, #0x2\n"
@@ -140,35 +140,35 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"fmla z25.s, z6.s, z1.s[1]\n"
"fmla z28.s, z6.s, z1.s[2]\n"
"fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p0/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "fmla z11.s, z4.s, z2.s[1]\n"
- "fmla z14.s, z4.s, z2.s[2]\n"
- "fmla z17.s, z4.s, z2.s[3]\n"
+ "fmla z8.s, z4.s, z3.s[0]\n"
+ "fmla z11.s, z4.s, z3.s[1]\n"
+ "fmla z14.s, z4.s, z3.s[2]\n"
+ "fmla z17.s, z4.s, z3.s[3]\n"
"ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z23.s, z4.s, z3.s[1]\n"
- "fmla z26.s, z4.s, z3.s[2]\n"
- "fmla z29.s, z4.s, z3.s[3]\n"
- "ld1w { z4.s }, p0/Z, [x26]\n"
- "fmla z9.s, z5.s, z2.s[0]\n"
- "fmla z12.s, z5.s, z2.s[1]\n"
- "fmla z15.s, z5.s, z2.s[2]\n"
- "fmla z18.s, z5.s, z2.s[3]\n"
- "fmla z21.s, z5.s, z3.s[0]\n"
- "fmla z24.s, z5.s, z3.s[1]\n"
- "fmla z27.s, z5.s, z3.s[2]\n"
- "fmla z30.s, z5.s, z3.s[3]\n"
+ "fmla z20.s, z4.s, z7.s[0]\n"
+ "fmla z23.s, z4.s, z7.s[1]\n"
+ "fmla z26.s, z4.s, z7.s[2]\n"
+ "fmla z29.s, z4.s, z7.s[3]\n"
+ "ld1w { z4.s }, p0/Z, [x24]\n"
+ "fmla z9.s, z5.s, z3.s[0]\n"
+ "fmla z12.s, z5.s, z3.s[1]\n"
+ "fmla z15.s, z5.s, z3.s[2]\n"
+ "fmla z18.s, z5.s, z3.s[3]\n"
+ "fmla z21.s, z5.s, z7.s[0]\n"
+ "fmla z24.s, z5.s, z7.s[1]\n"
+ "fmla z27.s, z5.s, z7.s[2]\n"
+ "fmla z30.s, z5.s, z7.s[3]\n"
"ld1w { z5.s }, p0/Z, [x22]\n"
- "fmla z10.s, z6.s, z2.s[0]\n"
- "fmla z13.s, z6.s, z2.s[1]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z25.s, z6.s, z3.s[1]\n"
- "fmla z28.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
+ "fmla z10.s, z2.s, z3.s[0]\n"
+ "fmla z13.s, z2.s, z3.s[1]\n"
+ "fmla z16.s, z2.s, z3.s[2]\n"
+ "fmla z19.s, z2.s, z3.s[3]\n"
+ "fmla z22.s, z2.s, z7.s[0]\n"
+ "fmla z25.s, z2.s, z7.s[1]\n"
+ "fmla z28.s, z2.s, z7.s[2]\n"
+ "fmla z31.s, z2.s, z7.s[3]\n"
"ld1w { z6.s }, p0/Z, [x21]\n"
"bge 4b\n"
"5:" // main loop skip
@@ -177,7 +177,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla z14.s, z4.s, z0.s[2]\n"
"fmla z17.s, z4.s, z0.s[3]\n"
- "addvl x26, x26, #1\n"
+ "addvl x24, x24, #1\n"
"fmla z20.s, z4.s, z1.s[0]\n"
"fmla z23.s, z4.s, z1.s[1]\n"
"addvl x22, x22, #1\n"
@@ -201,40 +201,40 @@ void sve_ffinterleaved_fp32_mla_8x3VL(
"fmla z28.s, z6.s, z1.s[2]\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"cbz x20, 6f\n"
- "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
- "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1w { z7.s }, p0/Z, [x26]\n"
- "ld1w { z4.s }, p0/Z, [x22]\n"
- "fmla z8.s, z7.s, z0.s[0]\n"
- "ld1w { z5.s }, p0/Z, [x21]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z14.s, z7.s, z0.s[2]\n"
- "fmla z17.s, z7.s, z0.s[3]\n"
- "fmla z20.s, z7.s, z1.s[0]\n"
- "fmla z23.s, z7.s, z1.s[1]\n"
- "fmla z26.s, z7.s, z1.s[2]\n"
- "fmla z29.s, z7.s, z1.s[3]\n"
- "fmla z9.s, z4.s, z0.s[0]\n"
- "fmla z12.s, z4.s, z0.s[1]\n"
- "fmla z15.s, z4.s, z0.s[2]\n"
- "fmla z18.s, z4.s, z0.s[3]\n"
- "fmla z21.s, z4.s, z1.s[0]\n"
- "fmla z24.s, z4.s, z1.s[1]\n"
- "fmla z27.s, z4.s, z1.s[2]\n"
- "fmla z30.s, z4.s, z1.s[3]\n"
- "fmla z10.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z16.s, z5.s, z0.s[2]\n"
- "fmla z19.s, z5.s, z0.s[3]\n"
- "fmla z22.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z28.s, z5.s, z1.s[2]\n"
- "fmla z31.s, z5.s, z1.s[3]\n"
+ "ld1w { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "fmla z11.s, z2.s, z4.s[1]\n"
+ "fmla z14.s, z2.s, z4.s[2]\n"
+ "fmla z17.s, z2.s, z4.s[3]\n"
+ "fmla z20.s, z2.s, z3.s[0]\n"
+ "fmla z23.s, z2.s, z3.s[1]\n"
+ "fmla z26.s, z2.s, z3.s[2]\n"
+ "fmla z29.s, z2.s, z3.s[3]\n"
+ "fmla z9.s, z1.s, z4.s[0]\n"
+ "fmla z12.s, z1.s, z4.s[1]\n"
+ "fmla z15.s, z1.s, z4.s[2]\n"
+ "fmla z18.s, z1.s, z4.s[3]\n"
+ "fmla z21.s, z1.s, z3.s[0]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z27.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z3.s[3]\n"
+ "fmla z10.s, z0.s, z4.s[0]\n"
+ "fmla z13.s, z0.s, z4.s[1]\n"
+ "fmla z16.s, z0.s, z4.s[2]\n"
+ "fmla z19.s, z0.s, z4.s[3]\n"
+ "fmla z22.s, z0.s, z3.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z28.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z3.s[3]\n"
"6:" // multiply loop done
- "decw x25, ALL, MUL #3\n"
+ "decw x26, ALL, MUL #3\n"
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
- "cmp x25, XZR\n"
+ "cmp x26, XZR\n"
"st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
"st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index 6677c23216..49ccce342e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, bfloat16>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -100,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index f0b00e6251..176f6e0d3a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -140,11 +140,11 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -157,87 +157,87 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460420a // bfdot z10.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x64684208 // bfdot z8.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6468420a // bfdot z10.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
"add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x2\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -246,17 +246,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -296,15 +296,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -320,12 +320,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -333,130 +333,130 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"21:" // Height 2: input setup done
"cmp x27, #0x8\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64614228 // bfdot z8.s, z17.h, z1.h[0]\n"
+ ".inst 0x6460422c // bfdot z12.s, z17.h, z0.h[0]\n"
+ ".inst 0x64614209 // bfdot z9.s, z16.h, z1.h[0]\n"
+ ".inst 0x6460420d // bfdot z13.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461422a // bfdot z10.s, z17.h, z1.h[0]\n"
+ ".inst 0x6460422e // bfdot z14.s, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"cmp x27, #0x8\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461420b // bfdot z11.s, z16.h, z1.h[0]\n"
+ ".inst 0x6460420f // bfdot z15.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x64694228 // bfdot z8.s, z17.h, z1.h[1]\n"
+ ".inst 0x6468422c // bfdot z12.s, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64694209 // bfdot z9.s, z16.h, z1.h[1]\n"
+ ".inst 0x6468420d // bfdot z13.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x6469422a // bfdot z10.s, z17.h, z1.h[1]\n"
+ ".inst 0x6468422e // bfdot z14.s, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6469420b // bfdot z11.s, z16.h, z1.h[1]\n"
+ ".inst 0x6468420f // bfdot z15.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64714228 // bfdot z8.s, z17.h, z1.h[2]\n"
+ ".inst 0x6470422c // bfdot z12.s, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64714209 // bfdot z9.s, z16.h, z1.h[2]\n"
+ ".inst 0x6470420d // bfdot z13.s, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471422a // bfdot z10.s, z17.h, z1.h[2]\n"
+ ".inst 0x6470422e // bfdot z14.s, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6471420b // bfdot z11.s, z16.h, z1.h[2]\n"
+ ".inst 0x6470420f // bfdot z15.s, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x64794228 // bfdot z8.s, z17.h, z1.h[3]\n"
+ ".inst 0x6478422c // bfdot z12.s, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x64794209 // bfdot z9.s, z16.h, z1.h[3]\n"
+ ".inst 0x6478420d // bfdot z13.s, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479422a // bfdot z10.s, z17.h, z1.h[3]\n"
+ ".inst 0x6478422e // bfdot z14.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479420b // bfdot z11.s, z16.h, z1.h[3]\n"
+ ".inst 0x6478420f // bfdot z15.s, z16.h, z0.h[3]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x2\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n"
+ ".inst 0x6461422c // bfdot z12.s, z17.h, z1.h[0]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ ".inst 0x6461420d // bfdot z13.s, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
+ ".inst 0x6461422e // bfdot z14.s, z17.h, z1.h[0]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ ".inst 0x6461420f // bfdot z15.s, z16.h, z1.h[0]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
+ ".inst 0x6469422c // bfdot z12.s, z17.h, z1.h[1]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ ".inst 0x6469420d // bfdot z13.s, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
+ ".inst 0x6469422e // bfdot z14.s, z17.h, z1.h[1]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
+ ".inst 0x6469420f // bfdot z15.s, z16.h, z1.h[1]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x6471422c // bfdot z12.s, z17.h, z1.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ ".inst 0x6471420d // bfdot z13.s, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6471422e // bfdot z14.s, z17.h, z1.h[2]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ ".inst 0x6471420f // bfdot z15.s, z16.h, z1.h[2]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479422c // bfdot z12.s, z17.h, z1.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ ".inst 0x6479420d // bfdot z13.s, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479422e // bfdot z14.s, z17.h, z1.h[3]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
+ ".inst 0x6479420f // bfdot z15.s, z16.h, z1.h[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -466,25 +466,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
"25:" // Height 2: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -532,20 +532,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -565,13 +565,13 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -580,86 +580,86 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"34:" // Height 3: input setup done
"cmp x27, #0x8\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ ".inst 0x646242a8 // bfdot z8.s, z21.h, z2.h[0]\n"
+ ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646042b0 // bfdot z16.s, z21.h, z0.h[0]\n"
+ ".inst 0x64624289 // bfdot z9.s, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ ".inst 0x64604291 // bfdot z17.s, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"cmp x27, #0x8\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646242aa // bfdot z10.s, z21.h, z2.h[0]\n"
+ ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x646042b2 // bfdot z18.s, z21.h, z0.h[0]\n"
+ ".inst 0x6462428b // bfdot z11.s, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n"
+ ".inst 0x64604293 // bfdot z19.s, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646a42a8 // bfdot z8.s, z21.h, z2.h[1]\n"
+ ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
+ ".inst 0x646842b0 // bfdot z16.s, z21.h, z0.h[1]\n"
+ ".inst 0x646a4289 // bfdot z9.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
+ ".inst 0x64684291 // bfdot z17.s, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x646a42aa // bfdot z10.s, z21.h, z2.h[1]\n"
+ ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n"
+ ".inst 0x646842b2 // bfdot z18.s, z21.h, z0.h[1]\n"
+ ".inst 0x646a428b // bfdot z11.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n"
+ ".inst 0x64684293 // bfdot z19.s, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x647242a8 // bfdot z8.s, z21.h, z2.h[2]\n"
+ ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
+ ".inst 0x647042b0 // bfdot z16.s, z21.h, z0.h[2]\n"
+ ".inst 0x64724289 // bfdot z9.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
+ ".inst 0x64704291 // bfdot z17.s, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x647242aa // bfdot z10.s, z21.h, z2.h[2]\n"
+ ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n"
+ ".inst 0x647042b2 // bfdot z18.s, z21.h, z0.h[2]\n"
+ ".inst 0x6472428b // bfdot z11.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n"
+ ".inst 0x64704293 // bfdot z19.s, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647a42a8 // bfdot z8.s, z21.h, z2.h[3]\n"
+ ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
+ ".inst 0x647842b0 // bfdot z16.s, z21.h, z0.h[3]\n"
+ ".inst 0x647a4289 // bfdot z9.s, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
+ ".inst 0x64784291 // bfdot z17.s, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647a42aa // bfdot z10.s, z21.h, z2.h[3]\n"
+ ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n"
+ ".inst 0x647842b2 // bfdot z18.s, z21.h, z0.h[3]\n"
+ ".inst 0x647a428b // bfdot z11.s, z20.h, z2.h[3]\n"
+ ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n"
+ ".inst 0x64784293 // bfdot z19.s, z20.h, z0.h[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -667,79 +667,79 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ ".inst 0x646042a8 // bfdot z8.s, z21.h, z0.h[0]\n"
+ ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646242b0 // bfdot z16.s, z21.h, z2.h[0]\n"
+ ".inst 0x64604289 // bfdot z9.s, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ ".inst 0x64624291 // bfdot z17.s, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646042aa // bfdot z10.s, z21.h, z0.h[0]\n"
+ ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n"
+ ".inst 0x646242b2 // bfdot z18.s, z21.h, z2.h[0]\n"
+ ".inst 0x6460428b // bfdot z11.s, z20.h, z0.h[0]\n"
+ ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n"
+ ".inst 0x64624293 // bfdot z19.s, z20.h, z2.h[0]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646842a8 // bfdot z8.s, z21.h, z0.h[1]\n"
+ ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
+ ".inst 0x646a42b0 // bfdot z16.s, z21.h, z2.h[1]\n"
+ ".inst 0x64684289 // bfdot z9.s, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
+ ".inst 0x646a4291 // bfdot z17.s, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646842aa // bfdot z10.s, z21.h, z0.h[1]\n"
+ ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n"
+ ".inst 0x646a42b2 // bfdot z18.s, z21.h, z2.h[1]\n"
+ ".inst 0x6468428b // bfdot z11.s, z20.h, z0.h[1]\n"
+ ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n"
+ ".inst 0x646a4293 // bfdot z19.s, z20.h, z2.h[1]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647042a8 // bfdot z8.s, z21.h, z0.h[2]\n"
+ ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
+ ".inst 0x647242b0 // bfdot z16.s, z21.h, z2.h[2]\n"
+ ".inst 0x64704289 // bfdot z9.s, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
+ ".inst 0x64724291 // bfdot z17.s, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647042aa // bfdot z10.s, z21.h, z0.h[2]\n"
+ ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n"
+ ".inst 0x647242b2 // bfdot z18.s, z21.h, z2.h[2]\n"
+ ".inst 0x6470428b // bfdot z11.s, z20.h, z0.h[2]\n"
+ ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n"
+ ".inst 0x64724293 // bfdot z19.s, z20.h, z2.h[2]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647842a8 // bfdot z8.s, z21.h, z0.h[3]\n"
+ ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
+ ".inst 0x647a42b0 // bfdot z16.s, z21.h, z2.h[3]\n"
+ ".inst 0x64784289 // bfdot z9.s, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
+ ".inst 0x647a4291 // bfdot z17.s, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647842aa // bfdot z10.s, z21.h, z0.h[3]\n"
+ ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n"
+ ".inst 0x647a42b2 // bfdot z18.s, z21.h, z2.h[3]\n"
+ ".inst 0x6478428b // bfdot z11.s, z20.h, z0.h[3]\n"
+ ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n"
+ ".inst 0x647a4293 // bfdot z19.s, z20.h, z2.h[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -750,33 +750,33 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
"38:" // Height 3: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -832,25 +832,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -874,14 +874,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -891,105 +891,105 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"47:" // Height 4: input setup done
"cmp x27, #0x8\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64634328 // bfdot z8.s, z25.h, z3.h[0]\n"
+ ".inst 0x6462432c // bfdot z12.s, z25.h, z2.h[0]\n"
+ ".inst 0x64614330 // bfdot z16.s, z25.h, z1.h[0]\n"
+ ".inst 0x64604334 // bfdot z20.s, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ ".inst 0x64634309 // bfdot z9.s, z24.h, z3.h[0]\n"
+ ".inst 0x6462430d // bfdot z13.s, z24.h, z2.h[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64614311 // bfdot z17.s, z24.h, z1.h[0]\n"
+ ".inst 0x64604315 // bfdot z21.s, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6463432a // bfdot z10.s, z25.h, z3.h[0]\n"
+ ".inst 0x6462432e // bfdot z14.s, z25.h, z2.h[0]\n"
+ ".inst 0x64614332 // bfdot z18.s, z25.h, z1.h[0]\n"
+ ".inst 0x64604336 // bfdot z22.s, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6463430b // bfdot z11.s, z24.h, z3.h[0]\n"
+ ".inst 0x6462430f // bfdot z15.s, z24.h, z2.h[0]\n"
+ ".inst 0x64614313 // bfdot z19.s, z24.h, z1.h[0]\n"
+ ".inst 0x64604317 // bfdot z23.s, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646b4328 // bfdot z8.s, z25.h, z3.h[1]\n"
+ ".inst 0x646a432c // bfdot z12.s, z25.h, z2.h[1]\n"
+ ".inst 0x64694330 // bfdot z16.s, z25.h, z1.h[1]\n"
+ ".inst 0x64684334 // bfdot z20.s, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646b4309 // bfdot z9.s, z24.h, z3.h[1]\n"
+ ".inst 0x646a430d // bfdot z13.s, z24.h, z2.h[1]\n"
+ ".inst 0x64694311 // bfdot z17.s, z24.h, z1.h[1]\n"
+ ".inst 0x64684315 // bfdot z21.s, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x646b432a // bfdot z10.s, z25.h, z3.h[1]\n"
+ ".inst 0x646a432e // bfdot z14.s, z25.h, z2.h[1]\n"
+ ".inst 0x64694332 // bfdot z18.s, z25.h, z1.h[1]\n"
+ ".inst 0x64684336 // bfdot z22.s, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646b430b // bfdot z11.s, z24.h, z3.h[1]\n"
+ ".inst 0x646a430f // bfdot z15.s, z24.h, z2.h[1]\n"
+ ".inst 0x64694313 // bfdot z19.s, z24.h, z1.h[1]\n"
+ ".inst 0x64684317 // bfdot z23.s, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64734328 // bfdot z8.s, z25.h, z3.h[2]\n"
+ ".inst 0x6472432c // bfdot z12.s, z25.h, z2.h[2]\n"
+ ".inst 0x64714330 // bfdot z16.s, z25.h, z1.h[2]\n"
+ ".inst 0x64704334 // bfdot z20.s, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64734309 // bfdot z9.s, z24.h, z3.h[2]\n"
+ ".inst 0x6472430d // bfdot z13.s, z24.h, z2.h[2]\n"
+ ".inst 0x64714311 // bfdot z17.s, z24.h, z1.h[2]\n"
+ ".inst 0x64704315 // bfdot z21.s, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6473432a // bfdot z10.s, z25.h, z3.h[2]\n"
+ ".inst 0x6472432e // bfdot z14.s, z25.h, z2.h[2]\n"
+ ".inst 0x64714332 // bfdot z18.s, z25.h, z1.h[2]\n"
+ ".inst 0x64704336 // bfdot z22.s, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6473430b // bfdot z11.s, z24.h, z3.h[2]\n"
+ ".inst 0x6472430f // bfdot z15.s, z24.h, z2.h[2]\n"
+ ".inst 0x64714313 // bfdot z19.s, z24.h, z1.h[2]\n"
+ ".inst 0x64704317 // bfdot z23.s, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647b4328 // bfdot z8.s, z25.h, z3.h[3]\n"
+ ".inst 0x647a432c // bfdot z12.s, z25.h, z2.h[3]\n"
+ ".inst 0x64794330 // bfdot z16.s, z25.h, z1.h[3]\n"
+ ".inst 0x64784334 // bfdot z20.s, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647b4309 // bfdot z9.s, z24.h, z3.h[3]\n"
+ ".inst 0x647a430d // bfdot z13.s, z24.h, z2.h[3]\n"
+ ".inst 0x64794311 // bfdot z17.s, z24.h, z1.h[3]\n"
+ ".inst 0x64784315 // bfdot z21.s, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647b432a // bfdot z10.s, z25.h, z3.h[3]\n"
+ ".inst 0x647a432e // bfdot z14.s, z25.h, z2.h[3]\n"
+ ".inst 0x64794332 // bfdot z18.s, z25.h, z1.h[3]\n"
+ ".inst 0x64784336 // bfdot z22.s, z25.h, z0.h[3]\n"
+ ".inst 0x647b430b // bfdot z11.s, z24.h, z3.h[3]\n"
+ ".inst 0x647a430f // bfdot z15.s, z24.h, z2.h[3]\n"
+ ".inst 0x64794313 // bfdot z19.s, z24.h, z1.h[3]\n"
+ ".inst 0x64784317 // bfdot z23.s, z24.h, z0.h[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -998,95 +998,95 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"subs x27, x27, #0x2\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604328 // bfdot z8.s, z25.h, z0.h[0]\n"
+ ".inst 0x6461432c // bfdot z12.s, z25.h, z1.h[0]\n"
+ ".inst 0x64624330 // bfdot z16.s, z25.h, z2.h[0]\n"
+ ".inst 0x64634334 // bfdot z20.s, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64604309 // bfdot z9.s, z24.h, z0.h[0]\n"
+ ".inst 0x6461430d // bfdot z13.s, z24.h, z1.h[0]\n"
+ ".inst 0x64624311 // bfdot z17.s, z24.h, z2.h[0]\n"
+ ".inst 0x64634315 // bfdot z21.s, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x6460432a // bfdot z10.s, z25.h, z0.h[0]\n"
+ ".inst 0x6461432e // bfdot z14.s, z25.h, z1.h[0]\n"
+ ".inst 0x64624332 // bfdot z18.s, z25.h, z2.h[0]\n"
+ ".inst 0x64634336 // bfdot z22.s, z25.h, z3.h[0]\n"
+ ".inst 0x6460430b // bfdot z11.s, z24.h, z0.h[0]\n"
+ ".inst 0x6461430f // bfdot z15.s, z24.h, z1.h[0]\n"
+ ".inst 0x64624313 // bfdot z19.s, z24.h, z2.h[0]\n"
+ ".inst 0x64634317 // bfdot z23.s, z24.h, z3.h[0]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684328 // bfdot z8.s, z25.h, z0.h[1]\n"
+ ".inst 0x6469432c // bfdot z12.s, z25.h, z1.h[1]\n"
+ ".inst 0x646a4330 // bfdot z16.s, z25.h, z2.h[1]\n"
+ ".inst 0x646b4334 // bfdot z20.s, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x64684309 // bfdot z9.s, z24.h, z0.h[1]\n"
+ ".inst 0x6469430d // bfdot z13.s, z24.h, z1.h[1]\n"
+ ".inst 0x646a4311 // bfdot z17.s, z24.h, z2.h[1]\n"
+ ".inst 0x646b4315 // bfdot z21.s, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x6468432a // bfdot z10.s, z25.h, z0.h[1]\n"
+ ".inst 0x6469432e // bfdot z14.s, z25.h, z1.h[1]\n"
+ ".inst 0x646a4332 // bfdot z18.s, z25.h, z2.h[1]\n"
+ ".inst 0x646b4336 // bfdot z22.s, z25.h, z3.h[1]\n"
+ ".inst 0x6468430b // bfdot z11.s, z24.h, z0.h[1]\n"
+ ".inst 0x6469430f // bfdot z15.s, z24.h, z1.h[1]\n"
+ ".inst 0x646a4313 // bfdot z19.s, z24.h, z2.h[1]\n"
+ ".inst 0x646b4317 // bfdot z23.s, z24.h, z3.h[1]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704328 // bfdot z8.s, z25.h, z0.h[2]\n"
+ ".inst 0x6471432c // bfdot z12.s, z25.h, z1.h[2]\n"
+ ".inst 0x64724330 // bfdot z16.s, z25.h, z2.h[2]\n"
+ ".inst 0x64734334 // bfdot z20.s, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x64704309 // bfdot z9.s, z24.h, z0.h[2]\n"
+ ".inst 0x6471430d // bfdot z13.s, z24.h, z1.h[2]\n"
+ ".inst 0x64724311 // bfdot z17.s, z24.h, z2.h[2]\n"
+ ".inst 0x64734315 // bfdot z21.s, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x6470432a // bfdot z10.s, z25.h, z0.h[2]\n"
+ ".inst 0x6471432e // bfdot z14.s, z25.h, z1.h[2]\n"
+ ".inst 0x64724332 // bfdot z18.s, z25.h, z2.h[2]\n"
+ ".inst 0x64734336 // bfdot z22.s, z25.h, z3.h[2]\n"
+ ".inst 0x6470430b // bfdot z11.s, z24.h, z0.h[2]\n"
+ ".inst 0x6471430f // bfdot z15.s, z24.h, z1.h[2]\n"
+ ".inst 0x64724313 // bfdot z19.s, z24.h, z2.h[2]\n"
+ ".inst 0x64734317 // bfdot z23.s, z24.h, z3.h[2]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784328 // bfdot z8.s, z25.h, z0.h[3]\n"
+ ".inst 0x6479432c // bfdot z12.s, z25.h, z1.h[3]\n"
+ ".inst 0x647a4330 // bfdot z16.s, z25.h, z2.h[3]\n"
+ ".inst 0x647b4334 // bfdot z20.s, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64784309 // bfdot z9.s, z24.h, z0.h[3]\n"
+ ".inst 0x6479430d // bfdot z13.s, z24.h, z1.h[3]\n"
+ ".inst 0x647a4311 // bfdot z17.s, z24.h, z2.h[3]\n"
+ ".inst 0x647b4315 // bfdot z21.s, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x6478432a // bfdot z10.s, z25.h, z0.h[3]\n"
+ ".inst 0x6479432e // bfdot z14.s, z25.h, z1.h[3]\n"
+ ".inst 0x647a4332 // bfdot z18.s, z25.h, z2.h[3]\n"
+ ".inst 0x647b4336 // bfdot z22.s, z25.h, z3.h[3]\n"
+ ".inst 0x6478430b // bfdot z11.s, z24.h, z0.h[3]\n"
+ ".inst 0x6479430f // bfdot z15.s, z24.h, z1.h[3]\n"
+ ".inst 0x647a4313 // bfdot z19.s, z24.h, z2.h[3]\n"
+ ".inst 0x647b4317 // bfdot z23.s, z24.h, z3.h[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1098,41 +1098,41 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
"51:" // Height 4: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1196,30 +1196,30 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1247,15 +1247,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1266,124 +1266,124 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"60:" // Height 5: input setup done
"cmp x27, #0x8\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ ".inst 0x646443a8 // bfdot z8.s, z29.h, z4.h[0]\n"
+ ".inst 0x646343ac // bfdot z12.s, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
+ ".inst 0x646143b4 // bfdot z20.s, z29.h, z1.h[0]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646043b8 // bfdot z24.s, z29.h, z0.h[0]\n"
+ ".inst 0x64644389 // bfdot z9.s, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x6463438d // bfdot z13.s, z28.h, z3.h[0]\n"
+ ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64614395 // bfdot z21.s, z28.h, z1.h[0]\n"
+ ".inst 0x64604399 // bfdot z25.s, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x646443aa // bfdot z10.s, z29.h, z4.h[0]\n"
+ ".inst 0x646343ae // bfdot z14.s, z29.h, z3.h[0]\n"
+ ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n"
+ ".inst 0x646143b6 // bfdot z22.s, z29.h, z1.h[0]\n"
+ ".inst 0x646043ba // bfdot z26.s, z29.h, z0.h[0]\n"
+ ".inst 0x6464438b // bfdot z11.s, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6463438f // bfdot z15.s, z28.h, z3.h[0]\n"
+ ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n"
+ ".inst 0x64614397 // bfdot z23.s, z28.h, z1.h[0]\n"
+ ".inst 0x6460439b // bfdot z27.s, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646c43a8 // bfdot z8.s, z29.h, z4.h[1]\n"
+ ".inst 0x646b43ac // bfdot z12.s, z29.h, z3.h[1]\n"
+ ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
+ ".inst 0x646943b4 // bfdot z20.s, z29.h, z1.h[1]\n"
+ ".inst 0x646843b8 // bfdot z24.s, z29.h, z0.h[1]\n"
+ ".inst 0x646c4389 // bfdot z9.s, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646b438d // bfdot z13.s, z28.h, z3.h[1]\n"
+ ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
+ ".inst 0x64694395 // bfdot z21.s, z28.h, z1.h[1]\n"
+ ".inst 0x64684399 // bfdot z25.s, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x646c43aa // bfdot z10.s, z29.h, z4.h[1]\n"
+ ".inst 0x646b43ae // bfdot z14.s, z29.h, z3.h[1]\n"
+ ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n"
+ ".inst 0x646943b6 // bfdot z22.s, z29.h, z1.h[1]\n"
+ ".inst 0x646843ba // bfdot z26.s, z29.h, z0.h[1]\n"
+ ".inst 0x646c438b // bfdot z11.s, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646b438f // bfdot z15.s, z28.h, z3.h[1]\n"
+ ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n"
+ ".inst 0x64694397 // bfdot z23.s, z28.h, z1.h[1]\n"
+ ".inst 0x6468439b // bfdot z27.s, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x647443a8 // bfdot z8.s, z29.h, z4.h[2]\n"
+ ".inst 0x647343ac // bfdot z12.s, z29.h, z3.h[2]\n"
+ ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
+ ".inst 0x647143b4 // bfdot z20.s, z29.h, z1.h[2]\n"
+ ".inst 0x647043b8 // bfdot z24.s, z29.h, z0.h[2]\n"
+ ".inst 0x64744389 // bfdot z9.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6473438d // bfdot z13.s, z28.h, z3.h[2]\n"
+ ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
+ ".inst 0x64714395 // bfdot z21.s, z28.h, z1.h[2]\n"
+ ".inst 0x64704399 // bfdot z25.s, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x647443aa // bfdot z10.s, z29.h, z4.h[2]\n"
+ ".inst 0x647343ae // bfdot z14.s, z29.h, z3.h[2]\n"
+ ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
+ ".inst 0x647143b6 // bfdot z22.s, z29.h, z1.h[2]\n"
+ ".inst 0x647043ba // bfdot z26.s, z29.h, z0.h[2]\n"
+ ".inst 0x6474438b // bfdot z11.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6473438f // bfdot z15.s, z28.h, z3.h[2]\n"
+ ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n"
+ ".inst 0x64714397 // bfdot z23.s, z28.h, z1.h[2]\n"
+ ".inst 0x6470439b // bfdot z27.s, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647c43a8 // bfdot z8.s, z29.h, z4.h[3]\n"
+ ".inst 0x647b43ac // bfdot z12.s, z29.h, z3.h[3]\n"
+ ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
+ ".inst 0x647943b4 // bfdot z20.s, z29.h, z1.h[3]\n"
+ ".inst 0x647843b8 // bfdot z24.s, z29.h, z0.h[3]\n"
+ ".inst 0x647c4389 // bfdot z9.s, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647b438d // bfdot z13.s, z28.h, z3.h[3]\n"
+ ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
+ ".inst 0x64794395 // bfdot z21.s, z28.h, z1.h[3]\n"
+ ".inst 0x64784399 // bfdot z25.s, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647c43aa // bfdot z10.s, z29.h, z4.h[3]\n"
+ ".inst 0x647b43ae // bfdot z14.s, z29.h, z3.h[3]\n"
+ ".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n"
+ ".inst 0x647943b6 // bfdot z22.s, z29.h, z1.h[3]\n"
+ ".inst 0x647843ba // bfdot z26.s, z29.h, z0.h[3]\n"
+ ".inst 0x647c438b // bfdot z11.s, z28.h, z4.h[3]\n"
+ ".inst 0x647b438f // bfdot z15.s, z28.h, z3.h[3]\n"
+ ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n"
+ ".inst 0x64794397 // bfdot z23.s, z28.h, z1.h[3]\n"
+ ".inst 0x6478439b // bfdot z27.s, z28.h, z0.h[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1393,111 +1393,111 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ ".inst 0x646043a8 // bfdot z8.s, z29.h, z0.h[0]\n"
+ ".inst 0x646143ac // bfdot z12.s, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
+ ".inst 0x646343b4 // bfdot z20.s, z29.h, z3.h[0]\n"
+ ".inst 0x646443b8 // bfdot z24.s, z29.h, z4.h[0]\n"
+ ".inst 0x64604389 // bfdot z9.s, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461438d // bfdot z13.s, z28.h, z1.h[0]\n"
+ ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
+ ".inst 0x64634395 // bfdot z21.s, z28.h, z3.h[0]\n"
+ ".inst 0x64644399 // bfdot z25.s, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646043aa // bfdot z10.s, z29.h, z0.h[0]\n"
+ ".inst 0x646143ae // bfdot z14.s, z29.h, z1.h[0]\n"
+ ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n"
+ ".inst 0x646343b6 // bfdot z22.s, z29.h, z3.h[0]\n"
+ ".inst 0x646443ba // bfdot z26.s, z29.h, z4.h[0]\n"
+ ".inst 0x6460438b // bfdot z11.s, z28.h, z0.h[0]\n"
+ ".inst 0x6461438f // bfdot z15.s, z28.h, z1.h[0]\n"
+ ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n"
+ ".inst 0x64634397 // bfdot z23.s, z28.h, z3.h[0]\n"
+ ".inst 0x6464439b // bfdot z27.s, z28.h, z4.h[0]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646843a8 // bfdot z8.s, z29.h, z0.h[1]\n"
+ ".inst 0x646943ac // bfdot z12.s, z29.h, z1.h[1]\n"
+ ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
+ ".inst 0x646b43b4 // bfdot z20.s, z29.h, z3.h[1]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x646c43b8 // bfdot z24.s, z29.h, z4.h[1]\n"
+ ".inst 0x64684389 // bfdot z9.s, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6469438d // bfdot z13.s, z28.h, z1.h[1]\n"
+ ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
+ ".inst 0x646b4395 // bfdot z21.s, z28.h, z3.h[1]\n"
+ ".inst 0x646c4399 // bfdot z25.s, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646843aa // bfdot z10.s, z29.h, z0.h[1]\n"
+ ".inst 0x646943ae // bfdot z14.s, z29.h, z1.h[1]\n"
+ ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n"
+ ".inst 0x646b43b6 // bfdot z22.s, z29.h, z3.h[1]\n"
+ ".inst 0x646c43ba // bfdot z26.s, z29.h, z4.h[1]\n"
+ ".inst 0x6468438b // bfdot z11.s, z28.h, z0.h[1]\n"
+ ".inst 0x6469438f // bfdot z15.s, z28.h, z1.h[1]\n"
+ ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n"
+ ".inst 0x646b4397 // bfdot z23.s, z28.h, z3.h[1]\n"
+ ".inst 0x646c439b // bfdot z27.s, z28.h, z4.h[1]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647043a8 // bfdot z8.s, z29.h, z0.h[2]\n"
+ ".inst 0x647143ac // bfdot z12.s, z29.h, z1.h[2]\n"
+ ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
+ ".inst 0x647343b4 // bfdot z20.s, z29.h, z3.h[2]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x647443b8 // bfdot z24.s, z29.h, z4.h[2]\n"
+ ".inst 0x64704389 // bfdot z9.s, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6471438d // bfdot z13.s, z28.h, z1.h[2]\n"
+ ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
+ ".inst 0x64734395 // bfdot z21.s, z28.h, z3.h[2]\n"
+ ".inst 0x64744399 // bfdot z25.s, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647043aa // bfdot z10.s, z29.h, z0.h[2]\n"
+ ".inst 0x647143ae // bfdot z14.s, z29.h, z1.h[2]\n"
+ ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
+ ".inst 0x647343b6 // bfdot z22.s, z29.h, z3.h[2]\n"
+ ".inst 0x647443ba // bfdot z26.s, z29.h, z4.h[2]\n"
+ ".inst 0x6470438b // bfdot z11.s, z28.h, z0.h[2]\n"
+ ".inst 0x6471438f // bfdot z15.s, z28.h, z1.h[2]\n"
+ ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n"
+ ".inst 0x64734397 // bfdot z23.s, z28.h, z3.h[2]\n"
+ ".inst 0x6474439b // bfdot z27.s, z28.h, z4.h[2]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647843a8 // bfdot z8.s, z29.h, z0.h[3]\n"
+ ".inst 0x647943ac // bfdot z12.s, z29.h, z1.h[3]\n"
+ ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
+ ".inst 0x647b43b4 // bfdot z20.s, z29.h, z3.h[3]\n"
+ ".inst 0x647c43b8 // bfdot z24.s, z29.h, z4.h[3]\n"
+ ".inst 0x64784389 // bfdot z9.s, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479438d // bfdot z13.s, z28.h, z1.h[3]\n"
+ ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
+ ".inst 0x647b4395 // bfdot z21.s, z28.h, z3.h[3]\n"
+ ".inst 0x647c4399 // bfdot z25.s, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647843aa // bfdot z10.s, z29.h, z0.h[3]\n"
+ ".inst 0x647943ae // bfdot z14.s, z29.h, z1.h[3]\n"
+ ".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n"
+ ".inst 0x647b43b6 // bfdot z22.s, z29.h, z3.h[3]\n"
+ ".inst 0x647c43ba // bfdot z26.s, z29.h, z4.h[3]\n"
+ ".inst 0x6478438b // bfdot z11.s, z28.h, z0.h[3]\n"
+ ".inst 0x6479438f // bfdot z15.s, z28.h, z1.h[3]\n"
+ ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n"
+ ".inst 0x647b4397 // bfdot z23.s, z28.h, z3.h[3]\n"
+ ".inst 0x647c439b // bfdot z27.s, z28.h, z4.h[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1510,49 +1510,49 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
"64:" // Height 5: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1627,35 +1627,35 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1687,16 +1687,16 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1708,143 +1708,143 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"73:" // Height 6: input setup done
"cmp x27, #0x8\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64674028 // bfdot z8.s, z1.h, z7.h[0]\n"
+ ".inst 0x6466402c // bfdot z12.s, z1.h, z6.h[0]\n"
+ ".inst 0x64654030 // bfdot z16.s, z1.h, z5.h[0]\n"
+ ".inst 0x64644034 // bfdot z20.s, z1.h, z4.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64634038 // bfdot z24.s, z1.h, z3.h[0]\n"
+ ".inst 0x6462403c // bfdot z28.s, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64674009 // bfdot z9.s, z0.h, z7.h[0]\n"
+ ".inst 0x6466400d // bfdot z13.s, z0.h, z6.h[0]\n"
+ ".inst 0x64654011 // bfdot z17.s, z0.h, z5.h[0]\n"
+ ".inst 0x64644015 // bfdot z21.s, z0.h, z4.h[0]\n"
+ ".inst 0x64634019 // bfdot z25.s, z0.h, z3.h[0]\n"
+ ".inst 0x6462401d // bfdot z29.s, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467402a // bfdot z10.s, z1.h, z7.h[0]\n"
+ ".inst 0x6466402e // bfdot z14.s, z1.h, z6.h[0]\n"
+ ".inst 0x64654032 // bfdot z18.s, z1.h, z5.h[0]\n"
+ ".inst 0x64644036 // bfdot z22.s, z1.h, z4.h[0]\n"
+ ".inst 0x6463403a // bfdot z26.s, z1.h, z3.h[0]\n"
+ ".inst 0x6462403e // bfdot z30.s, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6467400b // bfdot z11.s, z0.h, z7.h[0]\n"
+ ".inst 0x6466400f // bfdot z15.s, z0.h, z6.h[0]\n"
+ ".inst 0x64654013 // bfdot z19.s, z0.h, z5.h[0]\n"
+ ".inst 0x64644017 // bfdot z23.s, z0.h, z4.h[0]\n"
+ ".inst 0x6463401b // bfdot z27.s, z0.h, z3.h[0]\n"
+ ".inst 0x6462401f // bfdot z31.s, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646f4028 // bfdot z8.s, z1.h, z7.h[1]\n"
+ ".inst 0x646e402c // bfdot z12.s, z1.h, z6.h[1]\n"
+ ".inst 0x646d4030 // bfdot z16.s, z1.h, z5.h[1]\n"
+ ".inst 0x646c4034 // bfdot z20.s, z1.h, z4.h[1]\n"
+ ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
+ ".inst 0x646a403c // bfdot z28.s, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646f4009 // bfdot z9.s, z0.h, z7.h[1]\n"
+ ".inst 0x646e400d // bfdot z13.s, z0.h, z6.h[1]\n"
+ ".inst 0x646d4011 // bfdot z17.s, z0.h, z5.h[1]\n"
+ ".inst 0x646c4015 // bfdot z21.s, z0.h, z4.h[1]\n"
+ ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
+ ".inst 0x646a401d // bfdot z29.s, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ ".inst 0x646f402a // bfdot z10.s, z1.h, z7.h[1]\n"
+ ".inst 0x646e402e // bfdot z14.s, z1.h, z6.h[1]\n"
+ ".inst 0x646d4032 // bfdot z18.s, z1.h, z5.h[1]\n"
+ ".inst 0x646c4036 // bfdot z22.s, z1.h, z4.h[1]\n"
+ ".inst 0x646b403a // bfdot z26.s, z1.h, z3.h[1]\n"
+ ".inst 0x646a403e // bfdot z30.s, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646f400b // bfdot z11.s, z0.h, z7.h[1]\n"
+ ".inst 0x646e400f // bfdot z15.s, z0.h, z6.h[1]\n"
+ ".inst 0x646d4013 // bfdot z19.s, z0.h, z5.h[1]\n"
+ ".inst 0x646c4017 // bfdot z23.s, z0.h, z4.h[1]\n"
+ ".inst 0x646b401b // bfdot z27.s, z0.h, z3.h[1]\n"
+ ".inst 0x646a401f // bfdot z31.s, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64774028 // bfdot z8.s, z1.h, z7.h[2]\n"
+ ".inst 0x6476402c // bfdot z12.s, z1.h, z6.h[2]\n"
+ ".inst 0x64754030 // bfdot z16.s, z1.h, z5.h[2]\n"
+ ".inst 0x64744034 // bfdot z20.s, z1.h, z4.h[2]\n"
+ ".inst 0x64734038 // bfdot z24.s, z1.h, z3.h[2]\n"
+ ".inst 0x6472403c // bfdot z28.s, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64774009 // bfdot z9.s, z0.h, z7.h[2]\n"
+ ".inst 0x6476400d // bfdot z13.s, z0.h, z6.h[2]\n"
+ ".inst 0x64754011 // bfdot z17.s, z0.h, z5.h[2]\n"
+ ".inst 0x64744015 // bfdot z21.s, z0.h, z4.h[2]\n"
+ ".inst 0x64734019 // bfdot z25.s, z0.h, z3.h[2]\n"
+ ".inst 0x6472401d // bfdot z29.s, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6477402a // bfdot z10.s, z1.h, z7.h[2]\n"
+ ".inst 0x6476402e // bfdot z14.s, z1.h, z6.h[2]\n"
+ ".inst 0x64754032 // bfdot z18.s, z1.h, z5.h[2]\n"
+ ".inst 0x64744036 // bfdot z22.s, z1.h, z4.h[2]\n"
+ ".inst 0x6473403a // bfdot z26.s, z1.h, z3.h[2]\n"
+ ".inst 0x6472403e // bfdot z30.s, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6477400b // bfdot z11.s, z0.h, z7.h[2]\n"
+ ".inst 0x6476400f // bfdot z15.s, z0.h, z6.h[2]\n"
+ ".inst 0x64754013 // bfdot z19.s, z0.h, z5.h[2]\n"
+ ".inst 0x64744017 // bfdot z23.s, z0.h, z4.h[2]\n"
+ ".inst 0x6473401b // bfdot z27.s, z0.h, z3.h[2]\n"
+ ".inst 0x6472401f // bfdot z31.s, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647f4028 // bfdot z8.s, z1.h, z7.h[3]\n"
+ ".inst 0x647e402c // bfdot z12.s, z1.h, z6.h[3]\n"
+ ".inst 0x647d4030 // bfdot z16.s, z1.h, z5.h[3]\n"
+ ".inst 0x647c4034 // bfdot z20.s, z1.h, z4.h[3]\n"
+ ".inst 0x647b4038 // bfdot z24.s, z1.h, z3.h[3]\n"
+ ".inst 0x647a403c // bfdot z28.s, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647f4009 // bfdot z9.s, z0.h, z7.h[3]\n"
+ ".inst 0x647e400d // bfdot z13.s, z0.h, z6.h[3]\n"
+ ".inst 0x647d4011 // bfdot z17.s, z0.h, z5.h[3]\n"
+ ".inst 0x647c4015 // bfdot z21.s, z0.h, z4.h[3]\n"
+ ".inst 0x647b4019 // bfdot z25.s, z0.h, z3.h[3]\n"
+ ".inst 0x647a401d // bfdot z29.s, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647f402a // bfdot z10.s, z1.h, z7.h[3]\n"
+ ".inst 0x647e402e // bfdot z14.s, z1.h, z6.h[3]\n"
+ ".inst 0x647d4032 // bfdot z18.s, z1.h, z5.h[3]\n"
+ ".inst 0x647c4036 // bfdot z22.s, z1.h, z4.h[3]\n"
+ ".inst 0x647b403a // bfdot z26.s, z1.h, z3.h[3]\n"
+ ".inst 0x647a403e // bfdot z30.s, z1.h, z2.h[3]\n"
+ ".inst 0x647f400b // bfdot z11.s, z0.h, z7.h[3]\n"
+ ".inst 0x647e400f // bfdot z15.s, z0.h, z6.h[3]\n"
+ ".inst 0x647d4013 // bfdot z19.s, z0.h, z5.h[3]\n"
+ ".inst 0x647c4017 // bfdot z23.s, z0.h, z4.h[3]\n"
+ ".inst 0x647b401b // bfdot z27.s, z0.h, z3.h[3]\n"
+ ".inst 0x647a401f // bfdot z31.s, z0.h, z2.h[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1855,127 +1855,127 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ec // bfdot z12.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f0 // bfdot z16.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f4 // bfdot z20.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f8 // bfdot z24.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fc // bfdot z28.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646040c9 // bfdot z9.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140cd // bfdot z13.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d1 // bfdot z17.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d5 // bfdot z21.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440d9 // bfdot z25.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540dd // bfdot z29.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ ".inst 0x646040ea // bfdot z10.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ee // bfdot z14.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f2 // bfdot z18.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f6 // bfdot z22.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fa // bfdot z26.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fe // bfdot z30.s, z7.h, z5.h[0]\n"
+ ".inst 0x646040cb // bfdot z11.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140cf // bfdot z15.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d3 // bfdot z19.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d7 // bfdot z23.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440db // bfdot z27.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540df // bfdot z31.s, z6.h, z5.h[0]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646840e8 // bfdot z8.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ec // bfdot z12.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f0 // bfdot z16.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f4 // bfdot z20.s, z7.h, z3.h[1]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x646c40f8 // bfdot z24.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fc // bfdot z28.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646840c9 // bfdot z9.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cd // bfdot z13.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d5 // bfdot z21.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d9 // bfdot z25.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dd // bfdot z29.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ ".inst 0x646840ea // bfdot z10.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ee // bfdot z14.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f2 // bfdot z18.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f6 // bfdot z22.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fa // bfdot z26.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fe // bfdot z30.s, z7.h, z5.h[1]\n"
+ ".inst 0x646840cb // bfdot z11.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cf // bfdot z15.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d3 // bfdot z19.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d7 // bfdot z23.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40db // bfdot z27.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40df // bfdot z31.s, z6.h, z5.h[1]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647040e8 // bfdot z8.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ec // bfdot z12.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f0 // bfdot z16.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f4 // bfdot z20.s, z7.h, z3.h[2]\n"
"subs x27, x27, #0x2\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x647440f8 // bfdot z24.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fc // bfdot z28.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x647040c9 // bfdot z9.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cd // bfdot z13.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d1 // bfdot z17.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d5 // bfdot z21.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d9 // bfdot z25.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dd // bfdot z29.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ ".inst 0x647040ea // bfdot z10.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ee // bfdot z14.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f2 // bfdot z18.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f6 // bfdot z22.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fa // bfdot z26.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fe // bfdot z30.s, z7.h, z5.h[2]\n"
+ ".inst 0x647040cb // bfdot z11.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cf // bfdot z15.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d3 // bfdot z19.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d7 // bfdot z23.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440db // bfdot z27.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540df // bfdot z31.s, z6.h, z5.h[2]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647840e8 // bfdot z8.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ec // bfdot z12.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f0 // bfdot z16.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f4 // bfdot z20.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f8 // bfdot z24.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fc // bfdot z28.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x647840c9 // bfdot z9.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cd // bfdot z13.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d1 // bfdot z17.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d5 // bfdot z21.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d9 // bfdot z25.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dd // bfdot z29.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ ".inst 0x647840ea // bfdot z10.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ee // bfdot z14.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f2 // bfdot z18.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f6 // bfdot z22.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fa // bfdot z26.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fe // bfdot z30.s, z7.h, z5.h[3]\n"
+ ".inst 0x647840cb // bfdot z11.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cf // bfdot z15.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d7 // bfdot z23.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40db // bfdot z27.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40df // bfdot z31.s, z6.h, z5.h[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2082,7 +2082,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2090,4 +2089,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
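The closing guard now names ARM_COMPUTE_ENABLE_SVE, matching the switch throughout this patch from the compiler-defined __ARM_FEATURE_SVE macro to the library's own build flag, so these kernels are compiled in whenever the build enables SVE rather than only when this translation unit itself targets SVE. The resulting guard shape, in sketch form:

    #ifdef ARM_COMPUTE_ENABLE_SVE
    namespace arm_gemm {
    /* ... kernel definition ... */
    } // namespace arm_gemm
    #endif // ARM_COMPUTE_ENABLE_SVE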
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
index b8d237ff23..223d8a78de 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, bfloat16>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -100,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 9bb67f18d2..74e2d267bc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -133,16 +133,16 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 5f\n"
@@ -160,11 +160,11 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -176,86 +176,86 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
"add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"addvl x10, x10, #8\n"
"ble 11f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x10, x10, #8\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -268,17 +268,17 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -322,21 +322,21 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 18f\n"
@@ -354,12 +354,12 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -367,95 +367,95 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"21:" // Height 2: input setup done
"cmp x27, #0x8\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"addvl x10, x10, #8\n"
"ble 24f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
"addvl x10, x10, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -474,25 +474,25 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp2 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z7.s, p5/M, z7.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"25:" // Height 2: No activation
"st1w { z7.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -548,28 +548,28 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -601,13 +601,13 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -616,136 +616,136 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"34:" // Height 3: input setup done
"cmp x27, #0x8\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x8\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 37f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -768,33 +768,33 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z7.s, p5/M, z7.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
"38:" // Height 3: No activation
"st1w { z7.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -854,37 +854,37 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -912,14 +912,14 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -929,140 +929,140 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"47:" // Height 4: input setup done
"cmp x27, #0x8\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x8\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"ble 50f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1090,41 +1090,41 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
"51:" // Height 4: No activation
"st1w { z7.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1196,54 +1196,54 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1275,15 +1275,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1294,180 +1294,180 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"60:" // Height 5: input setup done
"cmp x27, #0x8\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqh { z6.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
+ ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
+ ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"ble 63f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1500,49 +1500,49 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z7.s, p5/M, z7.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z7.s, p5/M, z7.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
"64:" // Height 5: No activation
"st1w { z7.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1621,59 +1621,59 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
"zip2 z15.d, z16.d, z15.d\n"
"zip1 z16.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
"zip1 z18.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1705,16 +1705,16 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1726,184 +1726,184 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"73:" // Height 6: input setup done
"cmp x27, #0x8\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1rqh { z5.h }, p0/Z, [x22]\n"
- "ld1rqh { z6.h }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x8\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
"add x21, x21, #0x10\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z1.h }, p0/Z, [x26]\n"
- "ld1rqh { z2.h }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqh { z3.h }, p0/Z, [x24]\n"
- "ld1rqh { z4.h }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqh { z5.h }, p0/Z, [x22]\n"
- "ld1rqh { z6.h }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
- ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
- ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
- ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
- ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
- ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
- ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
- ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"ble 76f\n"
- "ld1h { z7.h }, p5/Z, [x10]\n"
- "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
- ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
- ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
- ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
- ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
- ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
- ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
- ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
- ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
- ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
- ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
- ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
- ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
- ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
- ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2041,4 +2041,4 @@ void sve_hybrid_bf16fp32_mmla_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
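
For context on the guard change just above: ARM_COMPUTE_ENABLE_SVE is defined by the library's build system when SVE support is requested, whereas __ARM_FEATURE_SVE is only set when the compiler itself targets SVE, so guarding on the build option decouples these kernels from per-file compiler flags. A minimal sketch of the resulting pattern, with a hypothetical kernel name:

#ifdef ARM_COMPUTE_ENABLE_SVE
namespace arm_gemm {
// Hypothetical declaration; the real kernels in this patch follow this
// shape and are compiled only when the build enables SVE.
void sve_example_kernel(const float *in, float *out, int n);
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SVE
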
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index 6db9c0cdf3..b930e4c0d5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
@@ -75,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, __fp16>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -84,6 +83,8 @@ public:
return { 12.44 };
case CPUModel::V1:
return { 31.51 };
+ case CPUModel::A64FX:
+ return { 49.14 };
}
}
@@ -107,5 +108,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SVE
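
For context, the hunk above extends the per-model performance table consulted by the kernel selector: get_performance_parameters() returns an estimated throughput figure for each recognised CPU, and this patch adds an A64FX entry. A minimal self-contained sketch of the same pattern follows; the stand-in types are hypothetical (the real CPUModel, CPUInfo and PerformanceParameters live elsewhere in the library), only the case labels and figures come from the hunk, and __fp16 requires an AArch64 toolchain.

#include <type_traits>

// Stand-ins so the sketch compiles on its own; the names mirror, but are
// not, the library's real types. The single field is assumed to be a
// MACs-per-cycle estimate.
enum class CPUModel { GENERIC, V1, A64FX };
struct CPUInfo { CPUModel get_cpu_model() const { return CPUModel::GENERIC; } };
struct PerformanceParameters { double kernel_macs_cycle; };

template<typename T>
PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
    // FP16 path: per-model estimates, mirroring the switch in the hunk.
    if (std::is_same<T, __fp16>::value) {
        switch (ci->get_cpu_model()) {
            default:              return { 12.44 };
            case CPUModel::V1:    return { 31.51 };
            case CPUModel::A64FX: return { 49.14 };
        }
    }
    return { 12.44 };  // non-FP16 fallback (assumed)
}
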
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
index a70e66cbe4..d1a9bb4a26 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -139,11 +139,11 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -159,12 +159,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"9:" // Height 1: Multiply loop: Main loop
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
@@ -174,27 +174,27 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
"addvl x10, x10, #4\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
"11:" // Height 1: No activation
"st1h { z8.h }, p3, [x9]\n"
"st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -234,15 +234,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x20]\n"
+ "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 17f\n"
"16:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -258,12 +258,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -271,7 +271,7 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"20:" // Height 2: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -282,18 +282,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"21:" // Height 2: Multiply loop: Main loop
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x26, x26, #0x2\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"subs x27, x27, #0x1\n"
"add x25, x25, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
@@ -303,41 +303,41 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.h, p4/M, z6.h, z0.h\n"
"fmla z12.h, p4/M, z6.h, z1.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #1\n"
"tbz %x[flags], #1, 23f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmin z12.h, p4/M, z12.h, z17.h\n"
+ "fmin z13.h, p4/M, z13.h, z17.h\n"
+ "fmin z14.h, p4/M, z14.h, z17.h\n"
+ "fmin z15.h, p4/M, z15.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "fmax z12.h, p4/M, z12.h, z16.h\n"
+ "fmax z13.h, p4/M, z13.h, z16.h\n"
+ "fmax z14.h, p4/M, z14.h, z16.h\n"
+ "fmax z15.h, p4/M, z15.h, z16.h\n"
"23:" // Height 2: No activation
"st1h { z8.h }, p3, [x9]\n"
"st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -385,20 +385,20 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x21]\n"
+ "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x20]\n"
+ "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 29f\n"
"28:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -418,13 +418,13 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -433,8 +433,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 32f\n"
"31:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"32:" // Height 3: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -450,21 +450,21 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x24, x24, #0x2\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -476,51 +476,51 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #1\n"
"add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 35f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "ld1rh { z20.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z21.h\n"
+ "fmin z9.h, p4/M, z9.h, z21.h\n"
+ "fmin z10.h, p4/M, z10.h, z21.h\n"
+ "fmin z11.h, p4/M, z11.h, z21.h\n"
+ "fmin z12.h, p4/M, z12.h, z21.h\n"
+ "fmin z13.h, p4/M, z13.h, z21.h\n"
+ "fmin z14.h, p4/M, z14.h, z21.h\n"
+ "fmin z15.h, p4/M, z15.h, z21.h\n"
+ "fmin z16.h, p4/M, z16.h, z21.h\n"
+ "fmin z17.h, p4/M, z17.h, z21.h\n"
+ "fmin z18.h, p4/M, z18.h, z21.h\n"
+ "fmin z19.h, p4/M, z19.h, z21.h\n"
+ "fmax z8.h, p4/M, z8.h, z20.h\n"
+ "fmax z9.h, p4/M, z9.h, z20.h\n"
+ "fmax z10.h, p4/M, z10.h, z20.h\n"
+ "fmax z11.h, p4/M, z11.h, z20.h\n"
+ "fmax z12.h, p4/M, z12.h, z20.h\n"
+ "fmax z13.h, p4/M, z13.h, z20.h\n"
+ "fmax z14.h, p4/M, z14.h, z20.h\n"
+ "fmax z15.h, p4/M, z15.h, z20.h\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
+ "fmax z17.h, p4/M, z17.h, z20.h\n"
+ "fmax z18.h, p4/M, z18.h, z20.h\n"
+ "fmax z19.h, p4/M, z19.h, z20.h\n"
"35:" // Height 3: No activation
"st1h { z8.h }, p3, [x9]\n"
"st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -576,25 +576,25 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p3/Z, [x9]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x22]\n"
+ "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x21]\n"
+ "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x20]\n"
+ "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 41f\n"
"40:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -618,14 +618,14 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -635,9 +635,9 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 44f\n"
"43:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"44:" // Height 4: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -654,7 +654,7 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x2\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -662,19 +662,19 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x23, x23, #0x2\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
"ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -686,22 +686,22 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.h, p4/M, z6.h, z2.h\n"
"fmla z20.h, p4/M, z6.h, z3.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #1\n"
@@ -709,41 +709,41 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 47f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmin z20.h, p4/M, z20.h, z1.h\n"
- "fmin z21.h, p4/M, z21.h, z1.h\n"
- "fmin z22.h, p4/M, z22.h, z1.h\n"
- "fmin z23.h, p4/M, z23.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
- "fmax z20.h, p4/M, z20.h, z0.h\n"
- "fmax z21.h, p4/M, z21.h, z0.h\n"
- "fmax z22.h, p4/M, z22.h, z0.h\n"
- "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "ld1rh { z24.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z25.h\n"
+ "fmin z9.h, p4/M, z9.h, z25.h\n"
+ "fmin z10.h, p4/M, z10.h, z25.h\n"
+ "fmin z11.h, p4/M, z11.h, z25.h\n"
+ "fmin z12.h, p4/M, z12.h, z25.h\n"
+ "fmin z13.h, p4/M, z13.h, z25.h\n"
+ "fmin z14.h, p4/M, z14.h, z25.h\n"
+ "fmin z15.h, p4/M, z15.h, z25.h\n"
+ "fmin z16.h, p4/M, z16.h, z25.h\n"
+ "fmin z17.h, p4/M, z17.h, z25.h\n"
+ "fmin z18.h, p4/M, z18.h, z25.h\n"
+ "fmin z19.h, p4/M, z19.h, z25.h\n"
+ "fmin z20.h, p4/M, z20.h, z25.h\n"
+ "fmin z21.h, p4/M, z21.h, z25.h\n"
+ "fmin z22.h, p4/M, z22.h, z25.h\n"
+ "fmin z23.h, p4/M, z23.h, z25.h\n"
+ "fmax z8.h, p4/M, z8.h, z24.h\n"
+ "fmax z9.h, p4/M, z9.h, z24.h\n"
+ "fmax z10.h, p4/M, z10.h, z24.h\n"
+ "fmax z11.h, p4/M, z11.h, z24.h\n"
+ "fmax z12.h, p4/M, z12.h, z24.h\n"
+ "fmax z13.h, p4/M, z13.h, z24.h\n"
+ "fmax z14.h, p4/M, z14.h, z24.h\n"
+ "fmax z15.h, p4/M, z15.h, z24.h\n"
+ "fmax z16.h, p4/M, z16.h, z24.h\n"
+ "fmax z17.h, p4/M, z17.h, z24.h\n"
+ "fmax z18.h, p4/M, z18.h, z24.h\n"
+ "fmax z19.h, p4/M, z19.h, z24.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "fmax z21.h, p4/M, z21.h, z24.h\n"
+ "fmax z22.h, p4/M, z22.h, z24.h\n"
+ "fmax z23.h, p4/M, z23.h, z24.h\n"
"47:" // Height 4: No activation
"st1h { z8.h }, p3, [x9]\n"
"st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -807,30 +807,30 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p3/Z, [x9]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x23, x9, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p3/Z, [x22]\n"
- "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x23]\n"
+ "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x22]\n"
+ "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x21]\n"
+ "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x20]\n"
+ "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 53f\n"
"52:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -858,15 +858,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -877,10 +877,10 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 56f\n"
"55:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"56:" // Height 5: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -902,29 +902,29 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x24, x24, #0x2\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x2\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"add x22, x22, #0x2\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z26.h, p4/M, z6.h, z4.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
"ld1h { z6.h }, p4/Z, [x10]\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
"ld1rh { z1.h }, p4/Z, [x25]\n"
"ld1rh { z2.h }, p4/Z, [x24]\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
- "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
"ld1rh { z3.h }, p4/Z, [x23]\n"
"ld1rh { z4.h }, p4/Z, [x22]\n"
"ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -939,23 +939,23 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"cmp x28, x20\n"
"fmla z24.h, p4/M, z6.h, z4.h\n"
"fmla z9.h, p4/M, z7.h, z0.h\n"
- "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, p4/M, z7.h, z1.h\n"
"fmla z17.h, p4/M, z7.h, z2.h\n"
"fmla z21.h, p4/M, z7.h, z3.h\n"
"fmla z25.h, p4/M, z7.h, z4.h\n"
- "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, p4/M, z6.h, z0.h\n"
- "fmla z14.h, p4/M, z6.h, z1.h\n"
- "fmla z18.h, p4/M, z6.h, z2.h\n"
- "fmla z22.h, p4/M, z6.h, z3.h\n"
- "fmla z26.h, p4/M, z6.h, z4.h\n"
- "fmla z11.h, p4/M, z7.h, z0.h\n"
- "fmla z15.h, p4/M, z7.h, z1.h\n"
- "fmla z19.h, p4/M, z7.h, z2.h\n"
- "fmla z23.h, p4/M, z7.h, z3.h\n"
- "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #1\n"
@@ -964,49 +964,49 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 59f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p4/Z, [x20]\n"
- "fmin z8.h, p4/M, z8.h, z1.h\n"
- "fmin z9.h, p4/M, z9.h, z1.h\n"
- "fmin z10.h, p4/M, z10.h, z1.h\n"
- "fmin z11.h, p4/M, z11.h, z1.h\n"
- "fmin z12.h, p4/M, z12.h, z1.h\n"
- "fmin z13.h, p4/M, z13.h, z1.h\n"
- "fmin z14.h, p4/M, z14.h, z1.h\n"
- "fmin z15.h, p4/M, z15.h, z1.h\n"
- "fmin z16.h, p4/M, z16.h, z1.h\n"
- "fmin z17.h, p4/M, z17.h, z1.h\n"
- "fmin z18.h, p4/M, z18.h, z1.h\n"
- "fmin z19.h, p4/M, z19.h, z1.h\n"
- "fmin z20.h, p4/M, z20.h, z1.h\n"
- "fmin z21.h, p4/M, z21.h, z1.h\n"
- "fmin z22.h, p4/M, z22.h, z1.h\n"
- "fmin z23.h, p4/M, z23.h, z1.h\n"
- "fmin z24.h, p4/M, z24.h, z1.h\n"
- "fmin z25.h, p4/M, z25.h, z1.h\n"
- "fmin z26.h, p4/M, z26.h, z1.h\n"
- "fmin z27.h, p4/M, z27.h, z1.h\n"
- "fmax z8.h, p4/M, z8.h, z0.h\n"
- "fmax z9.h, p4/M, z9.h, z0.h\n"
- "fmax z10.h, p4/M, z10.h, z0.h\n"
- "fmax z11.h, p4/M, z11.h, z0.h\n"
- "fmax z12.h, p4/M, z12.h, z0.h\n"
- "fmax z13.h, p4/M, z13.h, z0.h\n"
- "fmax z14.h, p4/M, z14.h, z0.h\n"
- "fmax z15.h, p4/M, z15.h, z0.h\n"
- "fmax z16.h, p4/M, z16.h, z0.h\n"
- "fmax z17.h, p4/M, z17.h, z0.h\n"
- "fmax z18.h, p4/M, z18.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z0.h\n"
- "fmax z20.h, p4/M, z20.h, z0.h\n"
- "fmax z21.h, p4/M, z21.h, z0.h\n"
- "fmax z22.h, p4/M, z22.h, z0.h\n"
- "fmax z23.h, p4/M, z23.h, z0.h\n"
- "fmax z24.h, p4/M, z24.h, z0.h\n"
- "fmax z25.h, p4/M, z25.h, z0.h\n"
- "fmax z26.h, p4/M, z26.h, z0.h\n"
- "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "ld1rh { z28.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z29.h\n"
+ "fmin z9.h, p4/M, z9.h, z29.h\n"
+ "fmin z10.h, p4/M, z10.h, z29.h\n"
+ "fmin z11.h, p4/M, z11.h, z29.h\n"
+ "fmin z12.h, p4/M, z12.h, z29.h\n"
+ "fmin z13.h, p4/M, z13.h, z29.h\n"
+ "fmin z14.h, p4/M, z14.h, z29.h\n"
+ "fmin z15.h, p4/M, z15.h, z29.h\n"
+ "fmin z16.h, p4/M, z16.h, z29.h\n"
+ "fmin z17.h, p4/M, z17.h, z29.h\n"
+ "fmin z18.h, p4/M, z18.h, z29.h\n"
+ "fmin z19.h, p4/M, z19.h, z29.h\n"
+ "fmin z20.h, p4/M, z20.h, z29.h\n"
+ "fmin z21.h, p4/M, z21.h, z29.h\n"
+ "fmin z22.h, p4/M, z22.h, z29.h\n"
+ "fmin z23.h, p4/M, z23.h, z29.h\n"
+ "fmin z24.h, p4/M, z24.h, z29.h\n"
+ "fmin z25.h, p4/M, z25.h, z29.h\n"
+ "fmin z26.h, p4/M, z26.h, z29.h\n"
+ "fmin z27.h, p4/M, z27.h, z29.h\n"
+ "fmax z8.h, p4/M, z8.h, z28.h\n"
+ "fmax z9.h, p4/M, z9.h, z28.h\n"
+ "fmax z10.h, p4/M, z10.h, z28.h\n"
+ "fmax z11.h, p4/M, z11.h, z28.h\n"
+ "fmax z12.h, p4/M, z12.h, z28.h\n"
+ "fmax z13.h, p4/M, z13.h, z28.h\n"
+ "fmax z14.h, p4/M, z14.h, z28.h\n"
+ "fmax z15.h, p4/M, z15.h, z28.h\n"
+ "fmax z16.h, p4/M, z16.h, z28.h\n"
+ "fmax z17.h, p4/M, z17.h, z28.h\n"
+ "fmax z18.h, p4/M, z18.h, z28.h\n"
+ "fmax z19.h, p4/M, z19.h, z28.h\n"
+ "fmax z20.h, p4/M, z20.h, z28.h\n"
+ "fmax z21.h, p4/M, z21.h, z28.h\n"
+ "fmax z22.h, p4/M, z22.h, z28.h\n"
+ "fmax z23.h, p4/M, z23.h, z28.h\n"
+ "fmax z24.h, p4/M, z24.h, z28.h\n"
+ "fmax z25.h, p4/M, z25.h, z28.h\n"
+ "fmax z26.h, p4/M, z26.h, z28.h\n"
+ "fmax z27.h, p4/M, z27.h, z28.h\n"
"59:" // Height 5: No activation
"st1h { z8.h }, p3, [x9]\n"
"st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -1081,35 +1081,35 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x24, x9, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p3/Z, [x25]\n"
- "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p3/Z, [x24]\n"
- "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p3/Z, [x23]\n"
- "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p3/Z, [x22]\n"
- "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1h { z28.h }, p3/Z, [x21]\n"
- "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p3/Z, [x20]\n"
+ "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
"b 65f\n"
"64:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1141,16 +1141,16 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 68f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1162,11 +1162,11 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"b 68f\n"
"67:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"68:" // Height 6: input setup done
"subs x27, x27, #0x1\n"
"ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1355,7 +1355,6 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"74:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1363,4 +1362,4 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
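
Editor's illustrative aside (not part of the patch): the predicated fmin/fmax tails in the hunks above are the kernel's fused activation clamp — after the FMLA accumulation, every fp16 accumulator register is clipped to [minval, maxval] broadcast from KernelArgs. A minimal standalone sketch of the same operation in SVE ACLE intrinsics follows; the function name clamp_fp16 and its signature are hypothetical, and unlike the real kernels (which keep accumulators in z-registers throughout) it round-trips through memory for clarity.

#include <arm_sve.h>   // SVE ACLE intrinsics; float16_t is typedef'd here

void clamp_fp16(float16_t *acc, uint64_t n, float16_t minval, float16_t maxval)
{
    svfloat16_t vmax = svdup_f16(maxval);  // broadcast, like "ld1rh { z29.h }, p4/Z, [x20]"
    svfloat16_t vmin = svdup_f16(minval);  // broadcast, like "ld1rh { z28.h }, p4/Z, [x20]"
    for (uint64_t i = 0; i < n; i += svcnth())  // svcnth() = fp16 lanes per vector
    {
        svbool_t pg = svwhilelt_b16(i, n);      // active-lane predicate, like p4/p5
        svfloat16_t v = svld1_f16(pg, acc + i);
        v = svmin_f16_m(pg, v, vmax);           // "fmin z8.h, p4/M, z8.h, z29.h"
        v = svmax_f16_m(pg, v, vmin);           // "fmax z8.h, p4/M, z8.h, z28.h"
        svst1_f16(pg, acc + i, v);
    }
}

Compiled with an SVE-and-fp16-capable target (e.g. -march=armv8.2-a+sve+fp16), this emits the same predicated fmin/fmax pattern; the generated kernels simply unroll it across the z8..z27 accumulator block instead of looping.
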
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index 6f0b3e0008..041825df6b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -139,11 +139,11 @@ void sve_hybrid_fp16_mla_6x4VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -156,164 +156,164 @@ void sve_hybrid_fp16_mla_6x4VL (
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x8\n"
"cmp x27, #0x8\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
"add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
"addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -322,17 +322,17 @@ void sve_hybrid_fp16_mla_6x4VL (
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
"12:" // Height 1: No activation
"st1h { z8.h }, p4, [x9]\n"
"st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -372,15 +372,15 @@ void sve_hybrid_fp16_mla_6x4VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x20]\n"
+ "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -396,12 +396,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -409,239 +409,239 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
"21:" // Height 2: input setup done
"cmp x27, #0x8\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[0]\n"
+ "fmla z12.h, z17.h, z0.h[0]\n"
+ "fmla z9.h, z16.h, z1.h[0]\n"
+ "fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[0]\n"
+ "fmla z14.h, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
"cmp x27, #0x8\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[0]\n"
+ "fmla z15.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[1]\n"
+ "fmla z12.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[1]\n"
+ "fmla z13.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[1]\n"
+ "fmla z14.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[1]\n"
+ "fmla z15.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[2]\n"
+ "fmla z12.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[2]\n"
+ "fmla z13.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[2]\n"
+ "fmla z14.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[2]\n"
+ "fmla z15.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[3]\n"
+ "fmla z12.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[3]\n"
+ "fmla z13.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[3]\n"
+ "fmla z14.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z16.h, z1.h[3]\n"
+ "fmla z15.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[4]\n"
+ "fmla z12.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[4]\n"
+ "fmla z13.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[4]\n"
+ "fmla z14.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[4]\n"
+ "fmla z15.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[5]\n"
+ "fmla z12.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[5]\n"
+ "fmla z13.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z10.h, z17.h, z1.h[5]\n"
+ "fmla z14.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[5]\n"
+ "fmla z15.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[6]\n"
+ "fmla z12.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[6]\n"
+ "fmla z13.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[6]\n"
+ "fmla z14.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[6]\n"
+ "fmla z15.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[7]\n"
+ "fmla z12.h, z17.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[7]\n"
+ "fmla z13.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[7]\n"
+ "fmla z14.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z1.h[7]\n"
+ "fmla z15.h, z16.h, z0.h[7]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
"ld1rqh { z0.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[0]\n"
+ "fmla z12.h, z17.h, z1.h[0]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z14.h, z17.h, z1.h[0]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "fmla z15.h, z16.h, z1.h[0]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z12.h, z17.h, z1.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z14.h, z17.h, z1.h[1]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "fmla z15.h, z16.h, z1.h[1]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z12.h, z17.h, z1.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z14.h, z17.h, z1.h[2]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "fmla z15.h, z16.h, z1.h[2]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z12.h, z17.h, z1.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z14.h, z17.h, z1.h[3]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "fmla z15.h, z16.h, z1.h[3]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z12.h, z17.h, z1.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z14.h, z17.h, z1.h[4]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "fmla z15.h, z16.h, z1.h[4]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z12.h, z17.h, z1.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z14.h, z17.h, z1.h[5]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "fmla z15.h, z16.h, z1.h[5]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z12.h, z17.h, z1.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z14.h, z17.h, z1.h[6]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "fmla z15.h, z16.h, z1.h[6]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z12.h, z17.h, z1.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z14.h, z17.h, z1.h[7]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "fmla z15.h, z16.h, z1.h[7]\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -651,25 +651,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"add x25, x9, x20, LSL #1\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmin z12.h, p5/M, z12.h, z17.h\n"
+ "fmin z13.h, p5/M, z13.h, z17.h\n"
+ "fmin z14.h, p5/M, z14.h, z17.h\n"
+ "fmin z15.h, p5/M, z15.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
+ "fmax z12.h, p5/M, z12.h, z16.h\n"
+ "fmax z13.h, p5/M, z13.h, z16.h\n"
+ "fmax z14.h, p5/M, z14.h, z16.h\n"
+ "fmax z15.h, p5/M, z15.h, z16.h\n"
"25:" // Height 2: No activation
"st1h { z8.h }, p4, [x9]\n"
"st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -717,20 +717,20 @@ void sve_hybrid_fp16_mla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x21]\n"
+ "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x20]\n"
+ "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -750,13 +750,13 @@ void sve_hybrid_fp16_mla_6x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -765,151 +765,151 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"34:" // Height 3: input setup done
"cmp x27, #0x8\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z21.h, z2.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"cmp x27, #0x8\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z10.h, z21.h, z2.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z18.h, z21.h, z0.h[0]\n"
+ "fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z0.h[1]\n"
+ "fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z0.h[1]\n"
+ "fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z0.h[2]\n"
+ "fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z0.h[2]\n"
+ "fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z0.h[3]\n"
+ "fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z0.h[3]\n"
+ "fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z0.h[4]\n"
+ "fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z0.h[4]\n"
+ "fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z0.h[5]\n"
+ "fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z10.h, z21.h, z2.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z0.h[5]\n"
+ "fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z0.h[6]\n"
+ "fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z0.h[6]\n"
+ "fmla z11.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z0.h[7]\n"
+ "fmla z9.h, z20.h, z2.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z0.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z0.h[7]\n"
+ "fmla z11.h, z20.h, z2.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z0.h[7]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -917,155 +917,155 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z1.h }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z21.h, z0.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z10.h, z21.h, z0.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
+ "fmla z18.h, z21.h, z2.h[0]\n"
+ "fmla z11.h, z20.h, z0.h[0]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z2.h[0]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z2.h[1]\n"
+ "fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z10.h, z21.h, z0.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z2.h[1]\n"
+ "fmla z11.h, z20.h, z0.h[1]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z2.h[1]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z2.h[2]\n"
+ "fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z10.h, z21.h, z0.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z2.h[2]\n"
+ "fmla z11.h, z20.h, z0.h[2]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z2.h[2]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z2.h[3]\n"
+ "fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z10.h, z21.h, z0.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z2.h[3]\n"
+ "fmla z11.h, z20.h, z0.h[3]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z2.h[3]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z2.h[4]\n"
+ "fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z2.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z10.h, z21.h, z0.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z2.h[4]\n"
+ "fmla z11.h, z20.h, z0.h[4]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z2.h[4]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z2.h[5]\n"
+ "fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z2.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z10.h, z21.h, z0.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z2.h[5]\n"
+ "fmla z11.h, z20.h, z0.h[5]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z2.h[5]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z2.h[6]\n"
+ "fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z2.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z10.h, z21.h, z0.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z2.h[6]\n"
+ "fmla z11.h, z20.h, z0.h[6]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z2.h[6]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z2.h[7]\n"
+ "fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z2.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z10.h, z21.h, z0.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z2.h[7]\n"
+ "fmla z11.h, z20.h, z0.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z2.h[7]\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1076,33 +1076,33 @@ void sve_hybrid_fp16_mla_6x4VL (
"add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
+ "ld1rh { z20.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z21.h\n"
+ "fmin z9.h, p5/M, z9.h, z21.h\n"
+ "fmin z10.h, p5/M, z10.h, z21.h\n"
+ "fmin z11.h, p5/M, z11.h, z21.h\n"
+ "fmin z12.h, p5/M, z12.h, z21.h\n"
+ "fmin z13.h, p5/M, z13.h, z21.h\n"
+ "fmin z14.h, p5/M, z14.h, z21.h\n"
+ "fmin z15.h, p5/M, z15.h, z21.h\n"
+ "fmin z16.h, p5/M, z16.h, z21.h\n"
+ "fmin z17.h, p5/M, z17.h, z21.h\n"
+ "fmin z18.h, p5/M, z18.h, z21.h\n"
+ "fmin z19.h, p5/M, z19.h, z21.h\n"
+ "fmax z8.h, p5/M, z8.h, z20.h\n"
+ "fmax z9.h, p5/M, z9.h, z20.h\n"
+ "fmax z10.h, p5/M, z10.h, z20.h\n"
+ "fmax z11.h, p5/M, z11.h, z20.h\n"
+ "fmax z12.h, p5/M, z12.h, z20.h\n"
+ "fmax z13.h, p5/M, z13.h, z20.h\n"
+ "fmax z14.h, p5/M, z14.h, z20.h\n"
+ "fmax z15.h, p5/M, z15.h, z20.h\n"
+ "fmax z16.h, p5/M, z16.h, z20.h\n"
+ "fmax z17.h, p5/M, z17.h, z20.h\n"
+ "fmax z18.h, p5/M, z18.h, z20.h\n"
+ "fmax z19.h, p5/M, z19.h, z20.h\n"
"38:" // Height 3: No activation
"st1h { z8.h }, p4, [x9]\n"
"st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -1158,25 +1158,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z8.h }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x22]\n"
+ "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x21]\n"
+ "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x20]\n"
+ "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1200,14 +1200,14 @@ void sve_hybrid_fp16_mla_6x4VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1217,186 +1217,186 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"47:" // Height 4: input setup done
"cmp x27, #0x8\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[0]\n"
+ "fmla z12.h, z25.h, z2.h[0]\n"
+ "fmla z16.h, z25.h, z1.h[0]\n"
+ "fmla z20.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z17.h, z24.h, z1.h[0]\n"
+ "fmla z21.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[0]\n"
+ "fmla z14.h, z25.h, z2.h[0]\n"
+ "fmla z18.h, z25.h, z1.h[0]\n"
+ "fmla z22.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[0]\n"
+ "fmla z15.h, z24.h, z2.h[0]\n"
+ "fmla z19.h, z24.h, z1.h[0]\n"
+ "fmla z23.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[1]\n"
+ "fmla z12.h, z25.h, z2.h[1]\n"
+ "fmla z16.h, z25.h, z1.h[1]\n"
+ "fmla z20.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[1]\n"
+ "fmla z13.h, z24.h, z2.h[1]\n"
+ "fmla z17.h, z24.h, z1.h[1]\n"
+ "fmla z21.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[1]\n"
+ "fmla z14.h, z25.h, z2.h[1]\n"
+ "fmla z18.h, z25.h, z1.h[1]\n"
+ "fmla z22.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[1]\n"
+ "fmla z15.h, z24.h, z2.h[1]\n"
+ "fmla z19.h, z24.h, z1.h[1]\n"
+ "fmla z23.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[2]\n"
+ "fmla z12.h, z25.h, z2.h[2]\n"
+ "fmla z16.h, z25.h, z1.h[2]\n"
+ "fmla z20.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[2]\n"
+ "fmla z13.h, z24.h, z2.h[2]\n"
+ "fmla z17.h, z24.h, z1.h[2]\n"
+ "fmla z21.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[2]\n"
+ "fmla z14.h, z25.h, z2.h[2]\n"
+ "fmla z18.h, z25.h, z1.h[2]\n"
+ "fmla z22.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[2]\n"
+ "fmla z15.h, z24.h, z2.h[2]\n"
+ "fmla z19.h, z24.h, z1.h[2]\n"
+ "fmla z23.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[3]\n"
+ "fmla z12.h, z25.h, z2.h[3]\n"
+ "fmla z16.h, z25.h, z1.h[3]\n"
+ "fmla z20.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[3]\n"
+ "fmla z13.h, z24.h, z2.h[3]\n"
+ "fmla z17.h, z24.h, z1.h[3]\n"
+ "fmla z21.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[3]\n"
+ "fmla z14.h, z25.h, z2.h[3]\n"
+ "fmla z18.h, z25.h, z1.h[3]\n"
+ "fmla z22.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z24.h, z3.h[3]\n"
+ "fmla z15.h, z24.h, z2.h[3]\n"
+ "fmla z19.h, z24.h, z1.h[3]\n"
+ "fmla z23.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[4]\n"
+ "fmla z12.h, z25.h, z2.h[4]\n"
+ "fmla z16.h, z25.h, z1.h[4]\n"
+ "fmla z20.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[4]\n"
+ "fmla z13.h, z24.h, z2.h[4]\n"
+ "fmla z17.h, z24.h, z1.h[4]\n"
+ "fmla z21.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[4]\n"
+ "fmla z14.h, z25.h, z2.h[4]\n"
+ "fmla z18.h, z25.h, z1.h[4]\n"
+ "fmla z22.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[4]\n"
+ "fmla z15.h, z24.h, z2.h[4]\n"
+ "fmla z19.h, z24.h, z1.h[4]\n"
+ "fmla z23.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[5]\n"
+ "fmla z12.h, z25.h, z2.h[5]\n"
+ "fmla z16.h, z25.h, z1.h[5]\n"
+ "fmla z20.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[5]\n"
+ "fmla z13.h, z24.h, z2.h[5]\n"
+ "fmla z17.h, z24.h, z1.h[5]\n"
+ "fmla z21.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z3.h[5]\n"
+ "fmla z14.h, z25.h, z2.h[5]\n"
+ "fmla z18.h, z25.h, z1.h[5]\n"
+ "fmla z22.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[5]\n"
+ "fmla z15.h, z24.h, z2.h[5]\n"
+ "fmla z19.h, z24.h, z1.h[5]\n"
+ "fmla z23.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[6]\n"
+ "fmla z12.h, z25.h, z2.h[6]\n"
+ "fmla z16.h, z25.h, z1.h[6]\n"
+ "fmla z20.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[6]\n"
+ "fmla z13.h, z24.h, z2.h[6]\n"
+ "fmla z17.h, z24.h, z1.h[6]\n"
+ "fmla z21.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[6]\n"
+ "fmla z14.h, z25.h, z2.h[6]\n"
+ "fmla z18.h, z25.h, z1.h[6]\n"
+ "fmla z22.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[6]\n"
+ "fmla z15.h, z24.h, z2.h[6]\n"
+ "fmla z19.h, z24.h, z1.h[6]\n"
+ "fmla z23.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[7]\n"
+ "fmla z12.h, z25.h, z2.h[7]\n"
+ "fmla z16.h, z25.h, z1.h[7]\n"
+ "fmla z20.h, z25.h, z0.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[7]\n"
+ "fmla z13.h, z24.h, z2.h[7]\n"
+ "fmla z17.h, z24.h, z1.h[7]\n"
+ "fmla z21.h, z24.h, z0.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[7]\n"
+ "fmla z14.h, z25.h, z2.h[7]\n"
+ "fmla z18.h, z25.h, z1.h[7]\n"
+ "fmla z22.h, z25.h, z0.h[7]\n"
+ "fmla z11.h, z24.h, z3.h[7]\n"
+ "fmla z15.h, z24.h, z2.h[7]\n"
+ "fmla z19.h, z24.h, z1.h[7]\n"
+ "fmla z23.h, z24.h, z0.h[7]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1405,187 +1405,187 @@ void sve_hybrid_fp16_mla_6x4VL (
"subs x27, x27, #0x1\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[0]\n"
+ "fmla z12.h, z25.h, z1.h[0]\n"
+ "fmla z16.h, z25.h, z2.h[0]\n"
+ "fmla z20.h, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
+ "fmla z17.h, z24.h, z2.h[0]\n"
+ "fmla z21.h, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z10.h, z25.h, z0.h[0]\n"
+ "fmla z14.h, z25.h, z1.h[0]\n"
+ "fmla z18.h, z25.h, z2.h[0]\n"
+ "fmla z22.h, z25.h, z3.h[0]\n"
+ "fmla z11.h, z24.h, z0.h[0]\n"
+ "fmla z15.h, z24.h, z1.h[0]\n"
+ "fmla z19.h, z24.h, z2.h[0]\n"
+ "fmla z23.h, z24.h, z3.h[0]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[1]\n"
+ "fmla z12.h, z25.h, z1.h[1]\n"
+ "fmla z16.h, z25.h, z2.h[1]\n"
+ "fmla z20.h, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[1]\n"
+ "fmla z13.h, z24.h, z1.h[1]\n"
+ "fmla z17.h, z24.h, z2.h[1]\n"
+ "fmla z21.h, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z10.h, z25.h, z0.h[1]\n"
+ "fmla z14.h, z25.h, z1.h[1]\n"
+ "fmla z18.h, z25.h, z2.h[1]\n"
+ "fmla z22.h, z25.h, z3.h[1]\n"
+ "fmla z11.h, z24.h, z0.h[1]\n"
+ "fmla z15.h, z24.h, z1.h[1]\n"
+ "fmla z19.h, z24.h, z2.h[1]\n"
+ "fmla z23.h, z24.h, z3.h[1]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[2]\n"
+ "fmla z12.h, z25.h, z1.h[2]\n"
+ "fmla z16.h, z25.h, z2.h[2]\n"
+ "fmla z20.h, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[2]\n"
+ "fmla z13.h, z24.h, z1.h[2]\n"
+ "fmla z17.h, z24.h, z2.h[2]\n"
+ "fmla z21.h, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z10.h, z25.h, z0.h[2]\n"
+ "fmla z14.h, z25.h, z1.h[2]\n"
+ "fmla z18.h, z25.h, z2.h[2]\n"
+ "fmla z22.h, z25.h, z3.h[2]\n"
+ "fmla z11.h, z24.h, z0.h[2]\n"
+ "fmla z15.h, z24.h, z1.h[2]\n"
+ "fmla z19.h, z24.h, z2.h[2]\n"
+ "fmla z23.h, z24.h, z3.h[2]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[3]\n"
+ "fmla z12.h, z25.h, z1.h[3]\n"
+ "fmla z16.h, z25.h, z2.h[3]\n"
+ "fmla z20.h, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[3]\n"
+ "fmla z13.h, z24.h, z1.h[3]\n"
+ "fmla z17.h, z24.h, z2.h[3]\n"
+ "fmla z21.h, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z10.h, z25.h, z0.h[3]\n"
+ "fmla z14.h, z25.h, z1.h[3]\n"
+ "fmla z18.h, z25.h, z2.h[3]\n"
+ "fmla z22.h, z25.h, z3.h[3]\n"
+ "fmla z11.h, z24.h, z0.h[3]\n"
+ "fmla z15.h, z24.h, z1.h[3]\n"
+ "fmla z19.h, z24.h, z2.h[3]\n"
+ "fmla z23.h, z24.h, z3.h[3]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[4]\n"
+ "fmla z12.h, z25.h, z1.h[4]\n"
+ "fmla z16.h, z25.h, z2.h[4]\n"
+ "fmla z20.h, z25.h, z3.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[4]\n"
+ "fmla z13.h, z24.h, z1.h[4]\n"
+ "fmla z17.h, z24.h, z2.h[4]\n"
+ "fmla z21.h, z24.h, z3.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z10.h, z25.h, z0.h[4]\n"
+ "fmla z14.h, z25.h, z1.h[4]\n"
+ "fmla z18.h, z25.h, z2.h[4]\n"
+ "fmla z22.h, z25.h, z3.h[4]\n"
+ "fmla z11.h, z24.h, z0.h[4]\n"
+ "fmla z15.h, z24.h, z1.h[4]\n"
+ "fmla z19.h, z24.h, z2.h[4]\n"
+ "fmla z23.h, z24.h, z3.h[4]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[5]\n"
+ "fmla z12.h, z25.h, z1.h[5]\n"
+ "fmla z16.h, z25.h, z2.h[5]\n"
+ "fmla z20.h, z25.h, z3.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[5]\n"
+ "fmla z13.h, z24.h, z1.h[5]\n"
+ "fmla z17.h, z24.h, z2.h[5]\n"
+ "fmla z21.h, z24.h, z3.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z10.h, z25.h, z0.h[5]\n"
+ "fmla z14.h, z25.h, z1.h[5]\n"
+ "fmla z18.h, z25.h, z2.h[5]\n"
+ "fmla z22.h, z25.h, z3.h[5]\n"
+ "fmla z11.h, z24.h, z0.h[5]\n"
+ "fmla z15.h, z24.h, z1.h[5]\n"
+ "fmla z19.h, z24.h, z2.h[5]\n"
+ "fmla z23.h, z24.h, z3.h[5]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[6]\n"
+ "fmla z12.h, z25.h, z1.h[6]\n"
+ "fmla z16.h, z25.h, z2.h[6]\n"
+ "fmla z20.h, z25.h, z3.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[6]\n"
+ "fmla z13.h, z24.h, z1.h[6]\n"
+ "fmla z17.h, z24.h, z2.h[6]\n"
+ "fmla z21.h, z24.h, z3.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z10.h, z25.h, z0.h[6]\n"
+ "fmla z14.h, z25.h, z1.h[6]\n"
+ "fmla z18.h, z25.h, z2.h[6]\n"
+ "fmla z22.h, z25.h, z3.h[6]\n"
+ "fmla z11.h, z24.h, z0.h[6]\n"
+ "fmla z15.h, z24.h, z1.h[6]\n"
+ "fmla z19.h, z24.h, z2.h[6]\n"
+ "fmla z23.h, z24.h, z3.h[6]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[7]\n"
+ "fmla z12.h, z25.h, z1.h[7]\n"
+ "fmla z16.h, z25.h, z2.h[7]\n"
+ "fmla z20.h, z25.h, z3.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[7]\n"
+ "fmla z13.h, z24.h, z1.h[7]\n"
+ "fmla z17.h, z24.h, z2.h[7]\n"
+ "fmla z21.h, z24.h, z3.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z0.h[7]\n"
+ "fmla z14.h, z25.h, z1.h[7]\n"
+ "fmla z18.h, z25.h, z2.h[7]\n"
+ "fmla z22.h, z25.h, z3.h[7]\n"
+ "fmla z11.h, z24.h, z0.h[7]\n"
+ "fmla z15.h, z24.h, z1.h[7]\n"
+ "fmla z19.h, z24.h, z2.h[7]\n"
+ "fmla z23.h, z24.h, z3.h[7]\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1597,41 +1597,41 @@ void sve_hybrid_fp16_mla_6x4VL (
"add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmin z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z1.h\n"
- "fmin z22.h, p5/M, z22.h, z1.h\n"
- "fmin z23.h, p5/M, z23.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
- "fmax z20.h, p5/M, z20.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z0.h\n"
- "fmax z22.h, p5/M, z22.h, z0.h\n"
- "fmax z23.h, p5/M, z23.h, z0.h\n"
+ "ld1rh { z24.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z25.h\n"
+ "fmin z9.h, p5/M, z9.h, z25.h\n"
+ "fmin z10.h, p5/M, z10.h, z25.h\n"
+ "fmin z11.h, p5/M, z11.h, z25.h\n"
+ "fmin z12.h, p5/M, z12.h, z25.h\n"
+ "fmin z13.h, p5/M, z13.h, z25.h\n"
+ "fmin z14.h, p5/M, z14.h, z25.h\n"
+ "fmin z15.h, p5/M, z15.h, z25.h\n"
+ "fmin z16.h, p5/M, z16.h, z25.h\n"
+ "fmin z17.h, p5/M, z17.h, z25.h\n"
+ "fmin z18.h, p5/M, z18.h, z25.h\n"
+ "fmin z19.h, p5/M, z19.h, z25.h\n"
+ "fmin z20.h, p5/M, z20.h, z25.h\n"
+ "fmin z21.h, p5/M, z21.h, z25.h\n"
+ "fmin z22.h, p5/M, z22.h, z25.h\n"
+ "fmin z23.h, p5/M, z23.h, z25.h\n"
+ "fmax z8.h, p5/M, z8.h, z24.h\n"
+ "fmax z9.h, p5/M, z9.h, z24.h\n"
+ "fmax z10.h, p5/M, z10.h, z24.h\n"
+ "fmax z11.h, p5/M, z11.h, z24.h\n"
+ "fmax z12.h, p5/M, z12.h, z24.h\n"
+ "fmax z13.h, p5/M, z13.h, z24.h\n"
+ "fmax z14.h, p5/M, z14.h, z24.h\n"
+ "fmax z15.h, p5/M, z15.h, z24.h\n"
+ "fmax z16.h, p5/M, z16.h, z24.h\n"
+ "fmax z17.h, p5/M, z17.h, z24.h\n"
+ "fmax z18.h, p5/M, z18.h, z24.h\n"
+ "fmax z19.h, p5/M, z19.h, z24.h\n"
+ "fmax z20.h, p5/M, z20.h, z24.h\n"
+ "fmax z21.h, p5/M, z21.h, z24.h\n"
+ "fmax z22.h, p5/M, z22.h, z24.h\n"
+ "fmax z23.h, p5/M, z23.h, z24.h\n"
"51:" // Height 4: No activation
"st1h { z8.h }, p4, [x9]\n"
"st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -1695,30 +1695,30 @@ void sve_hybrid_fp16_mla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #1\n"
+ "add x23, x9, x20, LSL #1\n"
"add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x22]\n"
- "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x23]\n"
+ "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x22]\n"
+ "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x21]\n"
+ "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x20]\n"
+ "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1746,15 +1746,15 @@ void sve_hybrid_fp16_mla_6x4VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -1765,221 +1765,221 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"60:" // Height 5: input setup done
"cmp x27, #0x8\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
"ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z29.h, z4.h[0]\n"
+ "fmla z12.h, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z1.h[0]\n"
"add x25, x25, #0x10\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z24.h, z29.h, z0.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z21.h, z28.h, z1.h[0]\n"
+ "fmla z25.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[0]\n"
+ "fmla z14.h, z29.h, z3.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z1.h[0]\n"
+ "fmla z26.h, z29.h, z0.h[0]\n"
+ "fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z1.h[0]\n"
+ "fmla z27.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[1]\n"
+ "fmla z12.h, z29.h, z3.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z1.h[1]\n"
+ "fmla z24.h, z29.h, z0.h[1]\n"
+ "fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z1.h[1]\n"
+ "fmla z25.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[1]\n"
+ "fmla z14.h, z29.h, z3.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z1.h[1]\n"
+ "fmla z26.h, z29.h, z0.h[1]\n"
+ "fmla z11.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z1.h[1]\n"
+ "fmla z27.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[2]\n"
+ "fmla z12.h, z29.h, z3.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z1.h[2]\n"
+ "fmla z24.h, z29.h, z0.h[2]\n"
+ "fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z1.h[2]\n"
+ "fmla z25.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[2]\n"
+ "fmla z14.h, z29.h, z3.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z1.h[2]\n"
+ "fmla z26.h, z29.h, z0.h[2]\n"
+ "fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z1.h[2]\n"
+ "fmla z27.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[3]\n"
+ "fmla z12.h, z29.h, z3.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z1.h[3]\n"
+ "fmla z24.h, z29.h, z0.h[3]\n"
+ "fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z1.h[3]\n"
+ "fmla z25.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[3]\n"
+ "fmla z14.h, z29.h, z3.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z1.h[3]\n"
+ "fmla z26.h, z29.h, z0.h[3]\n"
+ "fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z15.h, z28.h, z3.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z1.h[3]\n"
+ "fmla z27.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[4]\n"
+ "fmla z12.h, z29.h, z3.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z1.h[4]\n"
+ "fmla z24.h, z29.h, z0.h[4]\n"
+ "fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z1.h[4]\n"
+ "fmla z25.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[4]\n"
+ "fmla z14.h, z29.h, z3.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z1.h[4]\n"
+ "fmla z26.h, z29.h, z0.h[4]\n"
+ "fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z1.h[4]\n"
+ "fmla z27.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[5]\n"
+ "fmla z12.h, z29.h, z3.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z1.h[5]\n"
+ "fmla z24.h, z29.h, z0.h[5]\n"
+ "fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z1.h[5]\n"
+ "fmla z25.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z4.h[5]\n"
+ "fmla z14.h, z29.h, z3.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z1.h[5]\n"
+ "fmla z26.h, z29.h, z0.h[5]\n"
+ "fmla z11.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z1.h[5]\n"
+ "fmla z27.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[6]\n"
+ "fmla z12.h, z29.h, z3.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z1.h[6]\n"
+ "fmla z24.h, z29.h, z0.h[6]\n"
+ "fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z1.h[6]\n"
+ "fmla z25.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[6]\n"
+ "fmla z14.h, z29.h, z3.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z1.h[6]\n"
+ "fmla z26.h, z29.h, z0.h[6]\n"
+ "fmla z11.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z1.h[6]\n"
+ "fmla z27.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[7]\n"
+ "fmla z12.h, z29.h, z3.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z1.h[7]\n"
+ "fmla z24.h, z29.h, z0.h[7]\n"
+ "fmla z9.h, z28.h, z4.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z1.h[7]\n"
+ "fmla z25.h, z28.h, z0.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[7]\n"
+ "fmla z14.h, z29.h, z3.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z1.h[7]\n"
+ "fmla z26.h, z29.h, z0.h[7]\n"
+ "fmla z11.h, z28.h, z4.h[7]\n"
+ "fmla z15.h, z28.h, z3.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z1.h[7]\n"
+ "fmla z27.h, z28.h, z0.h[7]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -1989,219 +1989,219 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z2.h }, p0/Z, [x24]\n"
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z29.h, z0.h[0]\n"
+ "fmla z12.h, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z3.h[0]\n"
+ "fmla z24.h, z29.h, z4.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
+ "fmla z21.h, z28.h, z3.h[0]\n"
+ "fmla z25.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z10.h, z29.h, z0.h[0]\n"
+ "fmla z14.h, z29.h, z1.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z3.h[0]\n"
+ "fmla z26.h, z29.h, z4.h[0]\n"
+ "fmla z11.h, z28.h, z0.h[0]\n"
+ "fmla z15.h, z28.h, z1.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z3.h[0]\n"
+ "fmla z27.h, z28.h, z4.h[0]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[1]\n"
+ "fmla z12.h, z29.h, z1.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z3.h[1]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[1]\n"
+ "fmla z9.h, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z3.h[1]\n"
+ "fmla z25.h, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z10.h, z29.h, z0.h[1]\n"
+ "fmla z14.h, z29.h, z1.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z3.h[1]\n"
+ "fmla z26.h, z29.h, z4.h[1]\n"
+ "fmla z11.h, z28.h, z0.h[1]\n"
+ "fmla z15.h, z28.h, z1.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z3.h[1]\n"
+ "fmla z27.h, z28.h, z4.h[1]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[2]\n"
+ "fmla z12.h, z29.h, z1.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z3.h[2]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[2]\n"
+ "fmla z9.h, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z3.h[2]\n"
+ "fmla z25.h, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z10.h, z29.h, z0.h[2]\n"
+ "fmla z14.h, z29.h, z1.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z3.h[2]\n"
+ "fmla z26.h, z29.h, z4.h[2]\n"
+ "fmla z11.h, z28.h, z0.h[2]\n"
+ "fmla z15.h, z28.h, z1.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z3.h[2]\n"
+ "fmla z27.h, z28.h, z4.h[2]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[3]\n"
+ "fmla z12.h, z29.h, z1.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z3.h[3]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[3]\n"
+ "fmla z9.h, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z3.h[3]\n"
+ "fmla z25.h, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z10.h, z29.h, z0.h[3]\n"
+ "fmla z14.h, z29.h, z1.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z3.h[3]\n"
+ "fmla z26.h, z29.h, z4.h[3]\n"
+ "fmla z11.h, z28.h, z0.h[3]\n"
+ "fmla z15.h, z28.h, z1.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z3.h[3]\n"
+ "fmla z27.h, z28.h, z4.h[3]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[4]\n"
+ "fmla z12.h, z29.h, z1.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z3.h[4]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[4]\n"
+ "fmla z9.h, z28.h, z0.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z3.h[4]\n"
+ "fmla z25.h, z28.h, z4.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z10.h, z29.h, z0.h[4]\n"
+ "fmla z14.h, z29.h, z1.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z3.h[4]\n"
+ "fmla z26.h, z29.h, z4.h[4]\n"
+ "fmla z11.h, z28.h, z0.h[4]\n"
+ "fmla z15.h, z28.h, z1.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z3.h[4]\n"
+ "fmla z27.h, z28.h, z4.h[4]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[5]\n"
+ "fmla z12.h, z29.h, z1.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z3.h[5]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[5]\n"
+ "fmla z9.h, z28.h, z0.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z3.h[5]\n"
+ "fmla z25.h, z28.h, z4.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z10.h, z29.h, z0.h[5]\n"
+ "fmla z14.h, z29.h, z1.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z3.h[5]\n"
+ "fmla z26.h, z29.h, z4.h[5]\n"
+ "fmla z11.h, z28.h, z0.h[5]\n"
+ "fmla z15.h, z28.h, z1.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z3.h[5]\n"
+ "fmla z27.h, z28.h, z4.h[5]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[6]\n"
+ "fmla z12.h, z29.h, z1.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z3.h[6]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z29.h, z4.h[6]\n"
+ "fmla z9.h, z28.h, z0.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z3.h[6]\n"
+ "fmla z25.h, z28.h, z4.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z10.h, z29.h, z0.h[6]\n"
+ "fmla z14.h, z29.h, z1.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z3.h[6]\n"
+ "fmla z26.h, z29.h, z4.h[6]\n"
+ "fmla z11.h, z28.h, z0.h[6]\n"
+ "fmla z15.h, z28.h, z1.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z3.h[6]\n"
+ "fmla z27.h, z28.h, z4.h[6]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[7]\n"
+ "fmla z12.h, z29.h, z1.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z3.h[7]\n"
+ "fmla z24.h, z29.h, z4.h[7]\n"
+ "fmla z9.h, z28.h, z0.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z3.h[7]\n"
+ "fmla z25.h, z28.h, z4.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z0.h[7]\n"
+ "fmla z14.h, z29.h, z1.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z3.h[7]\n"
+ "fmla z26.h, z29.h, z4.h[7]\n"
+ "fmla z11.h, z28.h, z0.h[7]\n"
+ "fmla z15.h, z28.h, z1.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z3.h[7]\n"
+ "fmla z27.h, z28.h, z4.h[7]\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2214,49 +2214,49 @@ void sve_hybrid_fp16_mla_6x4VL (
"add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z0.h }, p5/Z, [x20]\n"
- "fmin z8.h, p5/M, z8.h, z1.h\n"
- "fmin z9.h, p5/M, z9.h, z1.h\n"
- "fmin z10.h, p5/M, z10.h, z1.h\n"
- "fmin z11.h, p5/M, z11.h, z1.h\n"
- "fmin z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z1.h\n"
- "fmin z14.h, p5/M, z14.h, z1.h\n"
- "fmin z15.h, p5/M, z15.h, z1.h\n"
- "fmin z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z1.h\n"
- "fmin z18.h, p5/M, z18.h, z1.h\n"
- "fmin z19.h, p5/M, z19.h, z1.h\n"
- "fmin z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z1.h\n"
- "fmin z22.h, p5/M, z22.h, z1.h\n"
- "fmin z23.h, p5/M, z23.h, z1.h\n"
- "fmin z24.h, p5/M, z24.h, z1.h\n"
- "fmin z25.h, p5/M, z25.h, z1.h\n"
- "fmin z26.h, p5/M, z26.h, z1.h\n"
- "fmin z27.h, p5/M, z27.h, z1.h\n"
- "fmax z8.h, p5/M, z8.h, z0.h\n"
- "fmax z9.h, p5/M, z9.h, z0.h\n"
- "fmax z10.h, p5/M, z10.h, z0.h\n"
- "fmax z11.h, p5/M, z11.h, z0.h\n"
- "fmax z12.h, p5/M, z12.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z0.h\n"
- "fmax z14.h, p5/M, z14.h, z0.h\n"
- "fmax z15.h, p5/M, z15.h, z0.h\n"
- "fmax z16.h, p5/M, z16.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z0.h\n"
- "fmax z18.h, p5/M, z18.h, z0.h\n"
- "fmax z19.h, p5/M, z19.h, z0.h\n"
- "fmax z20.h, p5/M, z20.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z0.h\n"
- "fmax z22.h, p5/M, z22.h, z0.h\n"
- "fmax z23.h, p5/M, z23.h, z0.h\n"
- "fmax z24.h, p5/M, z24.h, z0.h\n"
- "fmax z25.h, p5/M, z25.h, z0.h\n"
- "fmax z26.h, p5/M, z26.h, z0.h\n"
- "fmax z27.h, p5/M, z27.h, z0.h\n"
+ "ld1rh { z28.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z29.h\n"
+ "fmin z9.h, p5/M, z9.h, z29.h\n"
+ "fmin z10.h, p5/M, z10.h, z29.h\n"
+ "fmin z11.h, p5/M, z11.h, z29.h\n"
+ "fmin z12.h, p5/M, z12.h, z29.h\n"
+ "fmin z13.h, p5/M, z13.h, z29.h\n"
+ "fmin z14.h, p5/M, z14.h, z29.h\n"
+ "fmin z15.h, p5/M, z15.h, z29.h\n"
+ "fmin z16.h, p5/M, z16.h, z29.h\n"
+ "fmin z17.h, p5/M, z17.h, z29.h\n"
+ "fmin z18.h, p5/M, z18.h, z29.h\n"
+ "fmin z19.h, p5/M, z19.h, z29.h\n"
+ "fmin z20.h, p5/M, z20.h, z29.h\n"
+ "fmin z21.h, p5/M, z21.h, z29.h\n"
+ "fmin z22.h, p5/M, z22.h, z29.h\n"
+ "fmin z23.h, p5/M, z23.h, z29.h\n"
+ "fmin z24.h, p5/M, z24.h, z29.h\n"
+ "fmin z25.h, p5/M, z25.h, z29.h\n"
+ "fmin z26.h, p5/M, z26.h, z29.h\n"
+ "fmin z27.h, p5/M, z27.h, z29.h\n"
+ "fmax z8.h, p5/M, z8.h, z28.h\n"
+ "fmax z9.h, p5/M, z9.h, z28.h\n"
+ "fmax z10.h, p5/M, z10.h, z28.h\n"
+ "fmax z11.h, p5/M, z11.h, z28.h\n"
+ "fmax z12.h, p5/M, z12.h, z28.h\n"
+ "fmax z13.h, p5/M, z13.h, z28.h\n"
+ "fmax z14.h, p5/M, z14.h, z28.h\n"
+ "fmax z15.h, p5/M, z15.h, z28.h\n"
+ "fmax z16.h, p5/M, z16.h, z28.h\n"
+ "fmax z17.h, p5/M, z17.h, z28.h\n"
+ "fmax z18.h, p5/M, z18.h, z28.h\n"
+ "fmax z19.h, p5/M, z19.h, z28.h\n"
+ "fmax z20.h, p5/M, z20.h, z28.h\n"
+ "fmax z21.h, p5/M, z21.h, z28.h\n"
+ "fmax z22.h, p5/M, z22.h, z28.h\n"
+ "fmax z23.h, p5/M, z23.h, z28.h\n"
+ "fmax z24.h, p5/M, z24.h, z28.h\n"
+ "fmax z25.h, p5/M, z25.h, z28.h\n"
+ "fmax z26.h, p5/M, z26.h, z28.h\n"
+ "fmax z27.h, p5/M, z27.h, z28.h\n"
"64:" // Height 5: No activation
"st1h { z8.h }, p4, [x9]\n"
"st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -2331,35 +2331,35 @@ void sve_hybrid_fp16_mla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x24, x9, x20, LSL #1\n"
"add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
"ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
"ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x25]\n"
- "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x24]\n"
- "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x23]\n"
- "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x22]\n"
- "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1h { z28.h }, p4/Z, [x21]\n"
- "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x21]\n"
+ "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x20]\n"
+ "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -2391,16 +2391,16 @@ void sve_hybrid_fp16_mla_6x4VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #1\n"
@@ -2412,256 +2412,256 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
- "add x21, x22, x20, LSL #1\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"73:" // Height 6: input setup done
"cmp x27, #0x8\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.h, XZR, x27\n"
- "ld1rqh { z0.h }, p0/Z, [x26]\n"
- "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
"sub x27, x27, #0x8\n"
- "ld1rqh { z2.h }, p0/Z, [x24]\n"
- "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
"cmp x27, #0x8\n"
"add x26, x26, #0x10\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[0]\n"
+ "fmla z12.h, z1.h, z6.h[0]\n"
+ "fmla z16.h, z1.h, z5.h[0]\n"
+ "fmla z20.h, z1.h, z4.h[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z24.h, z1.h, z3.h[0]\n"
+ "fmla z28.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[0]\n"
+ "fmla z13.h, z0.h, z6.h[0]\n"
+ "fmla z17.h, z0.h, z5.h[0]\n"
+ "fmla z21.h, z0.h, z4.h[0]\n"
+ "fmla z25.h, z0.h, z3.h[0]\n"
+ "fmla z29.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z14.h, z1.h, z6.h[0]\n"
+ "fmla z18.h, z1.h, z5.h[0]\n"
+ "fmla z22.h, z1.h, z4.h[0]\n"
+ "fmla z26.h, z1.h, z3.h[0]\n"
+ "fmla z30.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[0]\n"
+ "fmla z15.h, z0.h, z6.h[0]\n"
+ "fmla z19.h, z0.h, z5.h[0]\n"
+ "fmla z23.h, z0.h, z4.h[0]\n"
+ "fmla z27.h, z0.h, z3.h[0]\n"
+ "fmla z31.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[1]\n"
+ "fmla z12.h, z1.h, z6.h[1]\n"
+ "fmla z16.h, z1.h, z5.h[1]\n"
+ "fmla z20.h, z1.h, z4.h[1]\n"
+ "fmla z24.h, z1.h, z3.h[1]\n"
+ "fmla z28.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[1]\n"
+ "fmla z13.h, z0.h, z6.h[1]\n"
+ "fmla z17.h, z0.h, z5.h[1]\n"
+ "fmla z21.h, z0.h, z4.h[1]\n"
+ "fmla z25.h, z0.h, z3.h[1]\n"
+ "fmla z29.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[1]\n"
+ "fmla z14.h, z1.h, z6.h[1]\n"
+ "fmla z18.h, z1.h, z5.h[1]\n"
+ "fmla z22.h, z1.h, z4.h[1]\n"
+ "fmla z26.h, z1.h, z3.h[1]\n"
+ "fmla z30.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[1]\n"
+ "fmla z15.h, z0.h, z6.h[1]\n"
+ "fmla z19.h, z0.h, z5.h[1]\n"
+ "fmla z23.h, z0.h, z4.h[1]\n"
+ "fmla z27.h, z0.h, z3.h[1]\n"
+ "fmla z31.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[2]\n"
+ "fmla z12.h, z1.h, z6.h[2]\n"
+ "fmla z16.h, z1.h, z5.h[2]\n"
+ "fmla z20.h, z1.h, z4.h[2]\n"
+ "fmla z24.h, z1.h, z3.h[2]\n"
+ "fmla z28.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[2]\n"
+ "fmla z13.h, z0.h, z6.h[2]\n"
+ "fmla z17.h, z0.h, z5.h[2]\n"
+ "fmla z21.h, z0.h, z4.h[2]\n"
+ "fmla z25.h, z0.h, z3.h[2]\n"
+ "fmla z29.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[2]\n"
+ "fmla z14.h, z1.h, z6.h[2]\n"
+ "fmla z18.h, z1.h, z5.h[2]\n"
+ "fmla z22.h, z1.h, z4.h[2]\n"
+ "fmla z26.h, z1.h, z3.h[2]\n"
+ "fmla z30.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[2]\n"
+ "fmla z15.h, z0.h, z6.h[2]\n"
+ "fmla z19.h, z0.h, z5.h[2]\n"
+ "fmla z23.h, z0.h, z4.h[2]\n"
+ "fmla z27.h, z0.h, z3.h[2]\n"
+ "fmla z31.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[3]\n"
+ "fmla z12.h, z1.h, z6.h[3]\n"
+ "fmla z16.h, z1.h, z5.h[3]\n"
+ "fmla z20.h, z1.h, z4.h[3]\n"
+ "fmla z24.h, z1.h, z3.h[3]\n"
+ "fmla z28.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[3]\n"
+ "fmla z13.h, z0.h, z6.h[3]\n"
+ "fmla z17.h, z0.h, z5.h[3]\n"
+ "fmla z21.h, z0.h, z4.h[3]\n"
+ "fmla z25.h, z0.h, z3.h[3]\n"
+ "fmla z29.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[3]\n"
+ "fmla z14.h, z1.h, z6.h[3]\n"
+ "fmla z18.h, z1.h, z5.h[3]\n"
+ "fmla z22.h, z1.h, z4.h[3]\n"
+ "fmla z26.h, z1.h, z3.h[3]\n"
+ "fmla z30.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z0.h, z7.h[3]\n"
+ "fmla z15.h, z0.h, z6.h[3]\n"
+ "fmla z19.h, z0.h, z5.h[3]\n"
+ "fmla z23.h, z0.h, z4.h[3]\n"
+ "fmla z27.h, z0.h, z3.h[3]\n"
+ "fmla z31.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[4]\n"
+ "fmla z12.h, z1.h, z6.h[4]\n"
+ "fmla z16.h, z1.h, z5.h[4]\n"
+ "fmla z20.h, z1.h, z4.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[4]\n"
+ "fmla z28.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[4]\n"
+ "fmla z13.h, z0.h, z6.h[4]\n"
+ "fmla z17.h, z0.h, z5.h[4]\n"
+ "fmla z21.h, z0.h, z4.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[4]\n"
+ "fmla z29.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[4]\n"
+ "fmla z14.h, z1.h, z6.h[4]\n"
+ "fmla z18.h, z1.h, z5.h[4]\n"
+ "fmla z22.h, z1.h, z4.h[4]\n"
+ "fmla z26.h, z1.h, z3.h[4]\n"
+ "fmla z30.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[4]\n"
+ "fmla z15.h, z0.h, z6.h[4]\n"
+ "fmla z19.h, z0.h, z5.h[4]\n"
+ "fmla z23.h, z0.h, z4.h[4]\n"
+ "fmla z27.h, z0.h, z3.h[4]\n"
+ "fmla z31.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[5]\n"
+ "fmla z12.h, z1.h, z6.h[5]\n"
+ "fmla z16.h, z1.h, z5.h[5]\n"
+ "fmla z20.h, z1.h, z4.h[5]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z28.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[5]\n"
+ "fmla z13.h, z0.h, z6.h[5]\n"
+ "fmla z17.h, z0.h, z5.h[5]\n"
+ "fmla z21.h, z0.h, z4.h[5]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z29.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[5]\n"
+ "fmla z14.h, z1.h, z6.h[5]\n"
+ "fmla z18.h, z1.h, z5.h[5]\n"
+ "fmla z22.h, z1.h, z4.h[5]\n"
+ "fmla z26.h, z1.h, z3.h[5]\n"
+ "fmla z30.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[5]\n"
+ "fmla z15.h, z0.h, z6.h[5]\n"
+ "fmla z19.h, z0.h, z5.h[5]\n"
+ "fmla z23.h, z0.h, z4.h[5]\n"
+ "fmla z27.h, z0.h, z3.h[5]\n"
+ "fmla z31.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[6]\n"
+ "fmla z12.h, z1.h, z6.h[6]\n"
+ "fmla z16.h, z1.h, z5.h[6]\n"
+ "fmla z20.h, z1.h, z4.h[6]\n"
+ "fmla z24.h, z1.h, z3.h[6]\n"
+ "fmla z28.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[6]\n"
+ "fmla z13.h, z0.h, z6.h[6]\n"
+ "fmla z17.h, z0.h, z5.h[6]\n"
+ "fmla z21.h, z0.h, z4.h[6]\n"
+ "fmla z25.h, z0.h, z3.h[6]\n"
+ "fmla z29.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[6]\n"
+ "fmla z14.h, z1.h, z6.h[6]\n"
+ "fmla z18.h, z1.h, z5.h[6]\n"
+ "fmla z22.h, z1.h, z4.h[6]\n"
+ "fmla z26.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[6]\n"
+ "fmla z15.h, z0.h, z6.h[6]\n"
+ "fmla z19.h, z0.h, z5.h[6]\n"
+ "fmla z23.h, z0.h, z4.h[6]\n"
+ "fmla z27.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[7]\n"
+ "fmla z12.h, z1.h, z6.h[7]\n"
+ "fmla z16.h, z1.h, z5.h[7]\n"
+ "fmla z20.h, z1.h, z4.h[7]\n"
+ "fmla z24.h, z1.h, z3.h[7]\n"
+ "fmla z28.h, z1.h, z2.h[7]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[7]\n"
+ "fmla z13.h, z0.h, z6.h[7]\n"
+ "fmla z17.h, z0.h, z5.h[7]\n"
+ "fmla z21.h, z0.h, z4.h[7]\n"
+ "fmla z25.h, z0.h, z3.h[7]\n"
+ "fmla z29.h, z0.h, z2.h[7]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[7]\n"
+ "fmla z14.h, z1.h, z6.h[7]\n"
+ "fmla z18.h, z1.h, z5.h[7]\n"
+ "fmla z22.h, z1.h, z4.h[7]\n"
+ "fmla z26.h, z1.h, z3.h[7]\n"
+ "fmla z30.h, z1.h, z2.h[7]\n"
+ "fmla z11.h, z0.h, z7.h[7]\n"
+ "fmla z15.h, z0.h, z6.h[7]\n"
+ "fmla z19.h, z0.h, z5.h[7]\n"
+ "fmla z23.h, z0.h, z4.h[7]\n"
+ "fmla z27.h, z0.h, z3.h[7]\n"
+ "fmla z31.h, z0.h, z2.h[7]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.h, XZR, x27\n"
@@ -2672,251 +2672,251 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z3.h }, p0/Z, [x23]\n"
"ld1rqh { z4.h }, p0/Z, [x22]\n"
"ld1rqh { z5.h }, p0/Z, [x21]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[0]\n"
+ "fmla z12.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z7.h, z3.h[0]\n"
+ "fmla z24.h, z7.h, z4.h[0]\n"
+ "fmla z28.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z2.h[0]\n"
+ "fmla z21.h, z6.h, z3.h[0]\n"
+ "fmla z25.h, z6.h, z4.h[0]\n"
+ "fmla z29.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
+ "fmla z10.h, z7.h, z0.h[0]\n"
+ "fmla z14.h, z7.h, z1.h[0]\n"
+ "fmla z18.h, z7.h, z2.h[0]\n"
+ "fmla z22.h, z7.h, z3.h[0]\n"
+ "fmla z26.h, z7.h, z4.h[0]\n"
+ "fmla z30.h, z7.h, z5.h[0]\n"
+ "fmla z11.h, z6.h, z0.h[0]\n"
+ "fmla z15.h, z6.h, z1.h[0]\n"
+ "fmla z19.h, z6.h, z2.h[0]\n"
+ "fmla z23.h, z6.h, z3.h[0]\n"
+ "fmla z27.h, z6.h, z4.h[0]\n"
+ "fmla z31.h, z6.h, z5.h[0]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[1]\n"
+ "fmla z12.h, z7.h, z1.h[1]\n"
+ "fmla z16.h, z7.h, z2.h[1]\n"
+ "fmla z20.h, z7.h, z3.h[1]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[1]\n"
+ "fmla z28.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[1]\n"
+ "fmla z13.h, z6.h, z1.h[1]\n"
+ "fmla z17.h, z6.h, z2.h[1]\n"
+ "fmla z21.h, z6.h, z3.h[1]\n"
+ "fmla z25.h, z6.h, z4.h[1]\n"
+ "fmla z29.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
+ "fmla z10.h, z7.h, z0.h[1]\n"
+ "fmla z14.h, z7.h, z1.h[1]\n"
+ "fmla z18.h, z7.h, z2.h[1]\n"
+ "fmla z22.h, z7.h, z3.h[1]\n"
+ "fmla z26.h, z7.h, z4.h[1]\n"
+ "fmla z30.h, z7.h, z5.h[1]\n"
+ "fmla z11.h, z6.h, z0.h[1]\n"
+ "fmla z15.h, z6.h, z1.h[1]\n"
+ "fmla z19.h, z6.h, z2.h[1]\n"
+ "fmla z23.h, z6.h, z3.h[1]\n"
+ "fmla z27.h, z6.h, z4.h[1]\n"
+ "fmla z31.h, z6.h, z5.h[1]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[2]\n"
+ "fmla z12.h, z7.h, z1.h[2]\n"
+ "fmla z16.h, z7.h, z2.h[2]\n"
+ "fmla z20.h, z7.h, z3.h[2]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[2]\n"
+ "fmla z28.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[2]\n"
+ "fmla z13.h, z6.h, z1.h[2]\n"
+ "fmla z17.h, z6.h, z2.h[2]\n"
+ "fmla z21.h, z6.h, z3.h[2]\n"
+ "fmla z25.h, z6.h, z4.h[2]\n"
+ "fmla z29.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
+ "fmla z10.h, z7.h, z0.h[2]\n"
+ "fmla z14.h, z7.h, z1.h[2]\n"
+ "fmla z18.h, z7.h, z2.h[2]\n"
+ "fmla z22.h, z7.h, z3.h[2]\n"
+ "fmla z26.h, z7.h, z4.h[2]\n"
+ "fmla z30.h, z7.h, z5.h[2]\n"
+ "fmla z11.h, z6.h, z0.h[2]\n"
+ "fmla z15.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z2.h[2]\n"
+ "fmla z23.h, z6.h, z3.h[2]\n"
+ "fmla z27.h, z6.h, z4.h[2]\n"
+ "fmla z31.h, z6.h, z5.h[2]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[3]\n"
+ "fmla z12.h, z7.h, z1.h[3]\n"
+ "fmla z16.h, z7.h, z2.h[3]\n"
+ "fmla z20.h, z7.h, z3.h[3]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[3]\n"
+ "fmla z28.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[3]\n"
+ "fmla z13.h, z6.h, z1.h[3]\n"
+ "fmla z17.h, z6.h, z2.h[3]\n"
+ "fmla z21.h, z6.h, z3.h[3]\n"
+ "fmla z25.h, z6.h, z4.h[3]\n"
+ "fmla z29.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
+ "fmla z10.h, z7.h, z0.h[3]\n"
+ "fmla z14.h, z7.h, z1.h[3]\n"
+ "fmla z18.h, z7.h, z2.h[3]\n"
+ "fmla z22.h, z7.h, z3.h[3]\n"
+ "fmla z26.h, z7.h, z4.h[3]\n"
+ "fmla z30.h, z7.h, z5.h[3]\n"
+ "fmla z11.h, z6.h, z0.h[3]\n"
+ "fmla z15.h, z6.h, z1.h[3]\n"
+ "fmla z19.h, z6.h, z2.h[3]\n"
+ "fmla z23.h, z6.h, z3.h[3]\n"
+ "fmla z27.h, z6.h, z4.h[3]\n"
+ "fmla z31.h, z6.h, z5.h[3]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[4]\n"
+ "fmla z12.h, z7.h, z1.h[4]\n"
+ "fmla z16.h, z7.h, z2.h[4]\n"
+ "fmla z20.h, z7.h, z3.h[4]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[4]\n"
+ "fmla z28.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[4]\n"
+ "fmla z13.h, z6.h, z1.h[4]\n"
+ "fmla z17.h, z6.h, z2.h[4]\n"
+ "fmla z21.h, z6.h, z3.h[4]\n"
+ "fmla z25.h, z6.h, z4.h[4]\n"
+ "fmla z29.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
+ "fmla z10.h, z7.h, z0.h[4]\n"
+ "fmla z14.h, z7.h, z1.h[4]\n"
+ "fmla z18.h, z7.h, z2.h[4]\n"
+ "fmla z22.h, z7.h, z3.h[4]\n"
+ "fmla z26.h, z7.h, z4.h[4]\n"
+ "fmla z30.h, z7.h, z5.h[4]\n"
+ "fmla z11.h, z6.h, z0.h[4]\n"
+ "fmla z15.h, z6.h, z1.h[4]\n"
+ "fmla z19.h, z6.h, z2.h[4]\n"
+ "fmla z23.h, z6.h, z3.h[4]\n"
+ "fmla z27.h, z6.h, z4.h[4]\n"
+ "fmla z31.h, z6.h, z5.h[4]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[5]\n"
+ "fmla z12.h, z7.h, z1.h[5]\n"
+ "fmla z16.h, z7.h, z2.h[5]\n"
+ "fmla z20.h, z7.h, z3.h[5]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[5]\n"
+ "fmla z28.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[5]\n"
+ "fmla z13.h, z6.h, z1.h[5]\n"
+ "fmla z17.h, z6.h, z2.h[5]\n"
+ "fmla z21.h, z6.h, z3.h[5]\n"
+ "fmla z25.h, z6.h, z4.h[5]\n"
+ "fmla z29.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
+ "fmla z10.h, z7.h, z0.h[5]\n"
+ "fmla z14.h, z7.h, z1.h[5]\n"
+ "fmla z18.h, z7.h, z2.h[5]\n"
+ "fmla z22.h, z7.h, z3.h[5]\n"
+ "fmla z26.h, z7.h, z4.h[5]\n"
+ "fmla z30.h, z7.h, z5.h[5]\n"
+ "fmla z11.h, z6.h, z0.h[5]\n"
+ "fmla z15.h, z6.h, z1.h[5]\n"
+ "fmla z19.h, z6.h, z2.h[5]\n"
+ "fmla z23.h, z6.h, z3.h[5]\n"
+ "fmla z27.h, z6.h, z4.h[5]\n"
+ "fmla z31.h, z6.h, z5.h[5]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[6]\n"
+ "fmla z12.h, z7.h, z1.h[6]\n"
+ "fmla z16.h, z7.h, z2.h[6]\n"
+ "fmla z20.h, z7.h, z3.h[6]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, z7.h, z4.h[6]\n"
+ "fmla z28.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[6]\n"
+ "fmla z13.h, z6.h, z1.h[6]\n"
+ "fmla z17.h, z6.h, z2.h[6]\n"
+ "fmla z21.h, z6.h, z3.h[6]\n"
+ "fmla z25.h, z6.h, z4.h[6]\n"
+ "fmla z29.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
+ "fmla z10.h, z7.h, z0.h[6]\n"
+ "fmla z14.h, z7.h, z1.h[6]\n"
+ "fmla z18.h, z7.h, z2.h[6]\n"
+ "fmla z22.h, z7.h, z3.h[6]\n"
+ "fmla z26.h, z7.h, z4.h[6]\n"
+ "fmla z30.h, z7.h, z5.h[6]\n"
+ "fmla z11.h, z6.h, z0.h[6]\n"
+ "fmla z15.h, z6.h, z1.h[6]\n"
+ "fmla z19.h, z6.h, z2.h[6]\n"
+ "fmla z23.h, z6.h, z3.h[6]\n"
+ "fmla z27.h, z6.h, z4.h[6]\n"
+ "fmla z31.h, z6.h, z5.h[6]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[7]\n"
+ "fmla z12.h, z7.h, z1.h[7]\n"
+ "fmla z16.h, z7.h, z2.h[7]\n"
+ "fmla z20.h, z7.h, z3.h[7]\n"
+ "fmla z24.h, z7.h, z4.h[7]\n"
+ "fmla z28.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[7]\n"
+ "fmla z13.h, z6.h, z1.h[7]\n"
+ "fmla z17.h, z6.h, z2.h[7]\n"
+ "fmla z21.h, z6.h, z3.h[7]\n"
+ "fmla z25.h, z6.h, z4.h[7]\n"
+ "fmla z29.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z7.h, z0.h[7]\n"
+ "fmla z14.h, z7.h, z1.h[7]\n"
+ "fmla z18.h, z7.h, z2.h[7]\n"
+ "fmla z22.h, z7.h, z3.h[7]\n"
+ "fmla z26.h, z7.h, z4.h[7]\n"
+ "fmla z30.h, z7.h, z5.h[7]\n"
+ "fmla z11.h, z6.h, z0.h[7]\n"
+ "fmla z15.h, z6.h, z1.h[7]\n"
+ "fmla z19.h, z6.h, z2.h[7]\n"
+ "fmla z23.h, z6.h, z3.h[7]\n"
+ "fmla z27.h, z6.h, z4.h[7]\n"
+ "fmla z31.h, z6.h, z5.h[7]\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -3023,7 +3023,6 @@ void sve_hybrid_fp16_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -3031,4 +3030,4 @@ void sve_hybrid_fp16_mla_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index b63b143d4c..880f9d1a27 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -75,13 +75,16 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
- case CPUModel::V1:
- return { 15.65 };
default:
return { 6.667 };
+ case CPUModel::A510:
+ return { 5.41 };
+ case CPUModel::V1:
+ return { 15.65 };
+ case CPUModel::A64FX:
+ return { 25.55 };
}
}
@@ -105,5 +108,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
index 9ae51af59b..66481f04f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -139,11 +139,11 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -159,12 +159,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"9:" // Height 1: Multiply loop: Main loop
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
@@ -174,27 +174,27 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
"addvl x10, x10, #4\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
"11:" // Height 1: No activation
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -234,15 +234,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 17f\n"
"16:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -258,12 +258,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -271,7 +271,7 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"20:" // Height 2: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -282,18 +282,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"21:" // Height 2: Multiply loop: Main loop
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x26, x26, #0x4\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"subs x27, x27, #0x1\n"
"add x25, x25, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
@@ -303,41 +303,41 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"fmla z8.s, p4/M, z6.s, z0.s\n"
"fmla z12.s, p4/M, z6.s, z1.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 23f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmin z12.s, p4/M, z12.s, z17.s\n"
+ "fmin z13.s, p4/M, z13.s, z17.s\n"
+ "fmin z14.s, p4/M, z14.s, z17.s\n"
+ "fmin z15.s, p4/M, z15.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "fmax z12.s, p4/M, z12.s, z16.s\n"
+ "fmax z13.s, p4/M, z13.s, z16.s\n"
+ "fmax z14.s, p4/M, z14.s, z16.s\n"
+ "fmax z15.s, p4/M, z15.s, z16.s\n"
"23:" // Height 2: No activation
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -385,20 +385,20 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 29f\n"
"28:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -418,13 +418,13 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -433,8 +433,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 32f\n"
"31:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"32:" // Height 3: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -450,21 +450,21 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x24, x24, #0x4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -476,51 +476,51 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #2\n"
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 35f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z21.s\n"
+ "fmin z9.s, p4/M, z9.s, z21.s\n"
+ "fmin z10.s, p4/M, z10.s, z21.s\n"
+ "fmin z11.s, p4/M, z11.s, z21.s\n"
+ "fmin z12.s, p4/M, z12.s, z21.s\n"
+ "fmin z13.s, p4/M, z13.s, z21.s\n"
+ "fmin z14.s, p4/M, z14.s, z21.s\n"
+ "fmin z15.s, p4/M, z15.s, z21.s\n"
+ "fmin z16.s, p4/M, z16.s, z21.s\n"
+ "fmin z17.s, p4/M, z17.s, z21.s\n"
+ "fmin z18.s, p4/M, z18.s, z21.s\n"
+ "fmin z19.s, p4/M, z19.s, z21.s\n"
+ "fmax z8.s, p4/M, z8.s, z20.s\n"
+ "fmax z9.s, p4/M, z9.s, z20.s\n"
+ "fmax z10.s, p4/M, z10.s, z20.s\n"
+ "fmax z11.s, p4/M, z11.s, z20.s\n"
+ "fmax z12.s, p4/M, z12.s, z20.s\n"
+ "fmax z13.s, p4/M, z13.s, z20.s\n"
+ "fmax z14.s, p4/M, z14.s, z20.s\n"
+ "fmax z15.s, p4/M, z15.s, z20.s\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
+ "fmax z17.s, p4/M, z17.s, z20.s\n"
+ "fmax z18.s, p4/M, z18.s, z20.s\n"
+ "fmax z19.s, p4/M, z19.s, z20.s\n"
"35:" // Height 3: No activation
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -576,25 +576,25 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 41f\n"
"40:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -618,14 +618,14 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -635,9 +635,9 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 44f\n"
"43:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"44:" // Height 4: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -654,7 +654,7 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"subs x27, x27, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -662,19 +662,19 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x23, x23, #0x4\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -686,22 +686,22 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"fmla z16.s, p4/M, z6.s, z2.s\n"
"fmla z20.s, p4/M, z6.s, z3.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #2\n"
@@ -709,41 +709,41 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 47f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmin z20.s, p4/M, z20.s, z1.s\n"
- "fmin z21.s, p4/M, z21.s, z1.s\n"
- "fmin z22.s, p4/M, z22.s, z1.s\n"
- "fmin z23.s, p4/M, z23.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
- "fmax z20.s, p4/M, z20.s, z0.s\n"
- "fmax z21.s, p4/M, z21.s, z0.s\n"
- "fmax z22.s, p4/M, z22.s, z0.s\n"
- "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "ld1rw { z24.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z25.s\n"
+ "fmin z9.s, p4/M, z9.s, z25.s\n"
+ "fmin z10.s, p4/M, z10.s, z25.s\n"
+ "fmin z11.s, p4/M, z11.s, z25.s\n"
+ "fmin z12.s, p4/M, z12.s, z25.s\n"
+ "fmin z13.s, p4/M, z13.s, z25.s\n"
+ "fmin z14.s, p4/M, z14.s, z25.s\n"
+ "fmin z15.s, p4/M, z15.s, z25.s\n"
+ "fmin z16.s, p4/M, z16.s, z25.s\n"
+ "fmin z17.s, p4/M, z17.s, z25.s\n"
+ "fmin z18.s, p4/M, z18.s, z25.s\n"
+ "fmin z19.s, p4/M, z19.s, z25.s\n"
+ "fmin z20.s, p4/M, z20.s, z25.s\n"
+ "fmin z21.s, p4/M, z21.s, z25.s\n"
+ "fmin z22.s, p4/M, z22.s, z25.s\n"
+ "fmin z23.s, p4/M, z23.s, z25.s\n"
+ "fmax z8.s, p4/M, z8.s, z24.s\n"
+ "fmax z9.s, p4/M, z9.s, z24.s\n"
+ "fmax z10.s, p4/M, z10.s, z24.s\n"
+ "fmax z11.s, p4/M, z11.s, z24.s\n"
+ "fmax z12.s, p4/M, z12.s, z24.s\n"
+ "fmax z13.s, p4/M, z13.s, z24.s\n"
+ "fmax z14.s, p4/M, z14.s, z24.s\n"
+ "fmax z15.s, p4/M, z15.s, z24.s\n"
+ "fmax z16.s, p4/M, z16.s, z24.s\n"
+ "fmax z17.s, p4/M, z17.s, z24.s\n"
+ "fmax z18.s, p4/M, z18.s, z24.s\n"
+ "fmax z19.s, p4/M, z19.s, z24.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "fmax z21.s, p4/M, z21.s, z24.s\n"
+ "fmax z22.s, p4/M, z22.s, z24.s\n"
+ "fmax z23.s, p4/M, z23.s, z24.s\n"
"47:" // Height 4: No activation
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -807,30 +807,30 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 53f\n"
"52:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -858,15 +858,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -877,10 +877,10 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 56f\n"
"55:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"56:" // Height 5: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -902,29 +902,29 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x24, x24, #0x4\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"add x22, x22, #0x4\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z26.s, p4/M, z6.s, z4.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1w { z6.s }, p4/Z, [x10]\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
- "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -939,23 +939,23 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"cmp x28, x20\n"
"fmla z24.s, p4/M, z6.s, z4.s\n"
"fmla z9.s, p4/M, z7.s, z0.s\n"
- "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, p4/M, z7.s, z1.s\n"
"fmla z17.s, p4/M, z7.s, z2.s\n"
"fmla z21.s, p4/M, z7.s, z3.s\n"
"fmla z25.s, p4/M, z7.s, z4.s\n"
- "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, p4/M, z6.s, z0.s\n"
- "fmla z14.s, p4/M, z6.s, z1.s\n"
- "fmla z18.s, p4/M, z6.s, z2.s\n"
- "fmla z22.s, p4/M, z6.s, z3.s\n"
- "fmla z26.s, p4/M, z6.s, z4.s\n"
- "fmla z11.s, p4/M, z7.s, z0.s\n"
- "fmla z15.s, p4/M, z7.s, z1.s\n"
- "fmla z19.s, p4/M, z7.s, z2.s\n"
- "fmla z23.s, p4/M, z7.s, z3.s\n"
- "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x25, x9, x20, LSL #2\n"
@@ -964,49 +964,49 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 59f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p4/Z, [x20]\n"
- "fmin z8.s, p4/M, z8.s, z1.s\n"
- "fmin z9.s, p4/M, z9.s, z1.s\n"
- "fmin z10.s, p4/M, z10.s, z1.s\n"
- "fmin z11.s, p4/M, z11.s, z1.s\n"
- "fmin z12.s, p4/M, z12.s, z1.s\n"
- "fmin z13.s, p4/M, z13.s, z1.s\n"
- "fmin z14.s, p4/M, z14.s, z1.s\n"
- "fmin z15.s, p4/M, z15.s, z1.s\n"
- "fmin z16.s, p4/M, z16.s, z1.s\n"
- "fmin z17.s, p4/M, z17.s, z1.s\n"
- "fmin z18.s, p4/M, z18.s, z1.s\n"
- "fmin z19.s, p4/M, z19.s, z1.s\n"
- "fmin z20.s, p4/M, z20.s, z1.s\n"
- "fmin z21.s, p4/M, z21.s, z1.s\n"
- "fmin z22.s, p4/M, z22.s, z1.s\n"
- "fmin z23.s, p4/M, z23.s, z1.s\n"
- "fmin z24.s, p4/M, z24.s, z1.s\n"
- "fmin z25.s, p4/M, z25.s, z1.s\n"
- "fmin z26.s, p4/M, z26.s, z1.s\n"
- "fmin z27.s, p4/M, z27.s, z1.s\n"
- "fmax z8.s, p4/M, z8.s, z0.s\n"
- "fmax z9.s, p4/M, z9.s, z0.s\n"
- "fmax z10.s, p4/M, z10.s, z0.s\n"
- "fmax z11.s, p4/M, z11.s, z0.s\n"
- "fmax z12.s, p4/M, z12.s, z0.s\n"
- "fmax z13.s, p4/M, z13.s, z0.s\n"
- "fmax z14.s, p4/M, z14.s, z0.s\n"
- "fmax z15.s, p4/M, z15.s, z0.s\n"
- "fmax z16.s, p4/M, z16.s, z0.s\n"
- "fmax z17.s, p4/M, z17.s, z0.s\n"
- "fmax z18.s, p4/M, z18.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z0.s\n"
- "fmax z20.s, p4/M, z20.s, z0.s\n"
- "fmax z21.s, p4/M, z21.s, z0.s\n"
- "fmax z22.s, p4/M, z22.s, z0.s\n"
- "fmax z23.s, p4/M, z23.s, z0.s\n"
- "fmax z24.s, p4/M, z24.s, z0.s\n"
- "fmax z25.s, p4/M, z25.s, z0.s\n"
- "fmax z26.s, p4/M, z26.s, z0.s\n"
- "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "ld1rw { z28.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z29.s\n"
+ "fmin z9.s, p4/M, z9.s, z29.s\n"
+ "fmin z10.s, p4/M, z10.s, z29.s\n"
+ "fmin z11.s, p4/M, z11.s, z29.s\n"
+ "fmin z12.s, p4/M, z12.s, z29.s\n"
+ "fmin z13.s, p4/M, z13.s, z29.s\n"
+ "fmin z14.s, p4/M, z14.s, z29.s\n"
+ "fmin z15.s, p4/M, z15.s, z29.s\n"
+ "fmin z16.s, p4/M, z16.s, z29.s\n"
+ "fmin z17.s, p4/M, z17.s, z29.s\n"
+ "fmin z18.s, p4/M, z18.s, z29.s\n"
+ "fmin z19.s, p4/M, z19.s, z29.s\n"
+ "fmin z20.s, p4/M, z20.s, z29.s\n"
+ "fmin z21.s, p4/M, z21.s, z29.s\n"
+ "fmin z22.s, p4/M, z22.s, z29.s\n"
+ "fmin z23.s, p4/M, z23.s, z29.s\n"
+ "fmin z24.s, p4/M, z24.s, z29.s\n"
+ "fmin z25.s, p4/M, z25.s, z29.s\n"
+ "fmin z26.s, p4/M, z26.s, z29.s\n"
+ "fmin z27.s, p4/M, z27.s, z29.s\n"
+ "fmax z8.s, p4/M, z8.s, z28.s\n"
+ "fmax z9.s, p4/M, z9.s, z28.s\n"
+ "fmax z10.s, p4/M, z10.s, z28.s\n"
+ "fmax z11.s, p4/M, z11.s, z28.s\n"
+ "fmax z12.s, p4/M, z12.s, z28.s\n"
+ "fmax z13.s, p4/M, z13.s, z28.s\n"
+ "fmax z14.s, p4/M, z14.s, z28.s\n"
+ "fmax z15.s, p4/M, z15.s, z28.s\n"
+ "fmax z16.s, p4/M, z16.s, z28.s\n"
+ "fmax z17.s, p4/M, z17.s, z28.s\n"
+ "fmax z18.s, p4/M, z18.s, z28.s\n"
+ "fmax z19.s, p4/M, z19.s, z28.s\n"
+ "fmax z20.s, p4/M, z20.s, z28.s\n"
+ "fmax z21.s, p4/M, z21.s, z28.s\n"
+ "fmax z22.s, p4/M, z22.s, z28.s\n"
+ "fmax z23.s, p4/M, z23.s, z28.s\n"
+ "fmax z24.s, p4/M, z24.s, z28.s\n"
+ "fmax z25.s, p4/M, z25.s, z28.s\n"
+ "fmax z26.s, p4/M, z26.s, z28.s\n"
+ "fmax z27.s, p4/M, z27.s, z28.s\n"
"59:" // Height 5: No activation
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -1081,35 +1081,35 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x25]\n"
- "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x23]\n"
- "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p3/Z, [x21]\n"
- "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 65f\n"
"64:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1141,16 +1141,16 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 68f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1162,11 +1162,11 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"b 68f\n"
"67:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"68:" // Height 6: input setup done
"subs x27, x27, #0x1\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1355,7 +1355,6 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"74:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1363,4 +1362,4 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
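The a64fx.cpp hunks above follow one mechanical pattern: the B-panel vectors reloaded mid-loop move out of z6/z7 into registers otherwise free at that height (z16/z17, z20/z21, z24/z25, z28/z29), presumably so the reloads no longer carry a write-after-read dependency on the FMLAs still consuming the previous values, and the x-register shuffles (input_offset held in x21, the pointer chains rebased onto x20 upward) keep the scratch registers packed. Functionally each Height-N block is unchanged: N rows of A are broadcast one element at a time (the ld1rw loads) against four vector-length-wide columns of the B panel, then clamped for the fused activation. A rough scalar model of one Height-1 step, purely illustrative, with invented names and VL standing for the SVE vector length in floats:

#include <algorithm>
#include <cstddef>

// Scalar model of one "Height 1" main-loop iteration above: four VL-wide
// accumulators (standing in for z8..z11) gather B-panel columns scaled by a
// single broadcast A element (the ld1rw into z0), i.e. the fmla instructions.
void height1_step(float *acc, const float *b_panel, float a, std::size_t VL)
{
    for (std::size_t j = 0; j < 4; ++j)        // z8, z9, z10, z11
        for (std::size_t l = 0; l < VL; ++l)   // one SVE vector each
            acc[j * VL + l] += b_panel[j * VL + l] * a;
}

// The tbz-guarded activation block: every accumulator is clamped to
// [minval, maxval], matching the fmin/fmax pairs before the st1w stores.
void clamp(float *acc, std::size_t n, float minval, float maxval)
{
    for (std::size_t i = 0; i < n; ++i)
        acc[i] = std::min(std::max(acc[i], minval), maxval);
}

The generic.cpp variant below does the same work but loads four A elements at once (ld1rqw) and unrolls the indexed FMLAs over z0.s[0]..z0.s[3], which is why its renames arrive in the same z17/z16, z21/z20 pairings.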
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index 71c6afba42..e1581f2026 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -139,11 +139,11 @@ void sve_hybrid_fp32_mla_6x4VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -156,87 +156,87 @@ void sve_hybrid_fp32_mla_6x4VL (
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
"add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
"addvl x10, x10, #4\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
"addvl x10, x10, #4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -245,17 +245,17 @@ void sve_hybrid_fp32_mla_6x4VL (
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -295,15 +295,15 @@ void sve_hybrid_fp32_mla_6x4VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -319,12 +319,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -332,130 +332,130 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"21:" // Height 2: input setup done
"cmp x27, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[0]\n"
+ "fmla z12.s, z17.s, z0.s[0]\n"
+ "fmla z9.s, z16.s, z1.s[0]\n"
+ "fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[0]\n"
+ "fmla z14.s, z17.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n"
"cmp x27, #0x4\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[0]\n"
+ "fmla z15.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[1]\n"
+ "fmla z12.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[1]\n"
+ "fmla z13.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z10.s, z17.s, z1.s[1]\n"
+ "fmla z14.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[1]\n"
+ "fmla z15.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[2]\n"
+ "fmla z12.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[2]\n"
+ "fmla z13.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[2]\n"
+ "fmla z14.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[2]\n"
+ "fmla z15.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[3]\n"
+ "fmla z12.s, z17.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[3]\n"
+ "fmla z13.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[3]\n"
+ "fmla z14.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z1.s[3]\n"
+ "fmla z15.s, z16.s, z0.s[3]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
"ld1rqw { z0.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[0]\n"
+ "fmla z12.s, z17.s, z1.s[0]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z14.s, z17.s, z1.s[0]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "fmla z15.s, z16.s, z1.s[0]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z12.s, z17.s, z1.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z14.s, z17.s, z1.s[1]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "fmla z15.s, z16.s, z1.s[1]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z12.s, z17.s, z1.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z14.s, z17.s, z1.s[2]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "fmla z15.s, z16.s, z1.s[2]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z12.s, z17.s, z1.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z14.s, z17.s, z1.s[3]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "fmla z15.s, z16.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -465,25 +465,25 @@ void sve_hybrid_fp32_mla_6x4VL (
"add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
"25:" // Height 2: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -531,20 +531,20 @@ void sve_hybrid_fp32_mla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -564,13 +564,13 @@ void sve_hybrid_fp32_mla_6x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -579,86 +579,86 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"34:" // Height 3: input setup done
"cmp x27, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z21.s, z2.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"cmp x27, #0x4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z10.s, z21.s, z2.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z18.s, z21.s, z0.s[0]\n"
+ "fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z0.s[1]\n"
+ "fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z10.s, z21.s, z2.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z0.s[1]\n"
+ "fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z0.s[2]\n"
+ "fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z0.s[2]\n"
+ "fmla z11.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z0.s[3]\n"
+ "fmla z9.s, z20.s, z2.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z0.s[3]\n"
+ "fmla z11.s, z20.s, z2.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z0.s[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -666,79 +666,79 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z1.s }, p0/Z, [x25]\n"
"subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z21.s, z0.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z2.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z10.s, z21.s, z0.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
+ "fmla z18.s, z21.s, z2.s[0]\n"
+ "fmla z11.s, z20.s, z0.s[0]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z2.s[0]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z2.s[1]\n"
+ "fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z2.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z10.s, z21.s, z0.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z2.s[1]\n"
+ "fmla z11.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z2.s[1]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z2.s[2]\n"
+ "fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z2.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z10.s, z21.s, z0.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z2.s[2]\n"
+ "fmla z11.s, z20.s, z0.s[2]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z2.s[2]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z2.s[3]\n"
+ "fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z2.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z10.s, z21.s, z0.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z2.s[3]\n"
+ "fmla z11.s, z20.s, z0.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -749,33 +749,33 @@ void sve_hybrid_fp32_mla_6x4VL (
"add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
"38:" // Height 3: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -831,25 +831,25 @@ void sve_hybrid_fp32_mla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -873,14 +873,14 @@ void sve_hybrid_fp32_mla_6x4VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -890,105 +890,105 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"47:" // Height 4: input setup done
"cmp x27, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[0]\n"
+ "fmla z12.s, z25.s, z2.s[0]\n"
+ "fmla z16.s, z25.s, z1.s[0]\n"
+ "fmla z20.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z17.s, z24.s, z1.s[0]\n"
+ "fmla z21.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[0]\n"
+ "fmla z14.s, z25.s, z2.s[0]\n"
+ "fmla z18.s, z25.s, z1.s[0]\n"
+ "fmla z22.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[0]\n"
+ "fmla z15.s, z24.s, z2.s[0]\n"
+ "fmla z19.s, z24.s, z1.s[0]\n"
+ "fmla z23.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[1]\n"
+ "fmla z12.s, z25.s, z2.s[1]\n"
+ "fmla z16.s, z25.s, z1.s[1]\n"
+ "fmla z20.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[1]\n"
+ "fmla z13.s, z24.s, z2.s[1]\n"
+ "fmla z17.s, z24.s, z1.s[1]\n"
+ "fmla z21.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z3.s[1]\n"
+ "fmla z14.s, z25.s, z2.s[1]\n"
+ "fmla z18.s, z25.s, z1.s[1]\n"
+ "fmla z22.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[1]\n"
+ "fmla z15.s, z24.s, z2.s[1]\n"
+ "fmla z19.s, z24.s, z1.s[1]\n"
+ "fmla z23.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[2]\n"
+ "fmla z12.s, z25.s, z2.s[2]\n"
+ "fmla z16.s, z25.s, z1.s[2]\n"
+ "fmla z20.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[2]\n"
+ "fmla z13.s, z24.s, z2.s[2]\n"
+ "fmla z17.s, z24.s, z1.s[2]\n"
+ "fmla z21.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[2]\n"
+ "fmla z14.s, z25.s, z2.s[2]\n"
+ "fmla z18.s, z25.s, z1.s[2]\n"
+ "fmla z22.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[2]\n"
+ "fmla z15.s, z24.s, z2.s[2]\n"
+ "fmla z19.s, z24.s, z1.s[2]\n"
+ "fmla z23.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[3]\n"
+ "fmla z12.s, z25.s, z2.s[3]\n"
+ "fmla z16.s, z25.s, z1.s[3]\n"
+ "fmla z20.s, z25.s, z0.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[3]\n"
+ "fmla z13.s, z24.s, z2.s[3]\n"
+ "fmla z17.s, z24.s, z1.s[3]\n"
+ "fmla z21.s, z24.s, z0.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[3]\n"
+ "fmla z14.s, z25.s, z2.s[3]\n"
+ "fmla z18.s, z25.s, z1.s[3]\n"
+ "fmla z22.s, z25.s, z0.s[3]\n"
+ "fmla z11.s, z24.s, z3.s[3]\n"
+ "fmla z15.s, z24.s, z2.s[3]\n"
+ "fmla z19.s, z24.s, z1.s[3]\n"
+ "fmla z23.s, z24.s, z0.s[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -997,95 +997,95 @@ void sve_hybrid_fp32_mla_6x4VL (
"subs x27, x27, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[0]\n"
+ "fmla z12.s, z25.s, z1.s[0]\n"
+ "fmla z16.s, z25.s, z2.s[0]\n"
+ "fmla z20.s, z25.s, z3.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
+ "fmla z17.s, z24.s, z2.s[0]\n"
+ "fmla z21.s, z24.s, z3.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z10.s, z25.s, z0.s[0]\n"
+ "fmla z14.s, z25.s, z1.s[0]\n"
+ "fmla z18.s, z25.s, z2.s[0]\n"
+ "fmla z22.s, z25.s, z3.s[0]\n"
+ "fmla z11.s, z24.s, z0.s[0]\n"
+ "fmla z15.s, z24.s, z1.s[0]\n"
+ "fmla z19.s, z24.s, z2.s[0]\n"
+ "fmla z23.s, z24.s, z3.s[0]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[1]\n"
+ "fmla z12.s, z25.s, z1.s[1]\n"
+ "fmla z16.s, z25.s, z2.s[1]\n"
+ "fmla z20.s, z25.s, z3.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[1]\n"
+ "fmla z13.s, z24.s, z1.s[1]\n"
+ "fmla z17.s, z24.s, z2.s[1]\n"
+ "fmla z21.s, z24.s, z3.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z10.s, z25.s, z0.s[1]\n"
+ "fmla z14.s, z25.s, z1.s[1]\n"
+ "fmla z18.s, z25.s, z2.s[1]\n"
+ "fmla z22.s, z25.s, z3.s[1]\n"
+ "fmla z11.s, z24.s, z0.s[1]\n"
+ "fmla z15.s, z24.s, z1.s[1]\n"
+ "fmla z19.s, z24.s, z2.s[1]\n"
+ "fmla z23.s, z24.s, z3.s[1]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[2]\n"
+ "fmla z12.s, z25.s, z1.s[2]\n"
+ "fmla z16.s, z25.s, z2.s[2]\n"
+ "fmla z20.s, z25.s, z3.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x1\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[2]\n"
+ "fmla z13.s, z24.s, z1.s[2]\n"
+ "fmla z17.s, z24.s, z2.s[2]\n"
+ "fmla z21.s, z24.s, z3.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z10.s, z25.s, z0.s[2]\n"
+ "fmla z14.s, z25.s, z1.s[2]\n"
+ "fmla z18.s, z25.s, z2.s[2]\n"
+ "fmla z22.s, z25.s, z3.s[2]\n"
+ "fmla z11.s, z24.s, z0.s[2]\n"
+ "fmla z15.s, z24.s, z1.s[2]\n"
+ "fmla z19.s, z24.s, z2.s[2]\n"
+ "fmla z23.s, z24.s, z3.s[2]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[3]\n"
+ "fmla z12.s, z25.s, z1.s[3]\n"
+ "fmla z16.s, z25.s, z2.s[3]\n"
+ "fmla z20.s, z25.s, z3.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[3]\n"
+ "fmla z13.s, z24.s, z1.s[3]\n"
+ "fmla z17.s, z24.s, z2.s[3]\n"
+ "fmla z21.s, z24.s, z3.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z0.s[3]\n"
+ "fmla z14.s, z25.s, z1.s[3]\n"
+ "fmla z18.s, z25.s, z2.s[3]\n"
+ "fmla z22.s, z25.s, z3.s[3]\n"
+ "fmla z11.s, z24.s, z0.s[3]\n"
+ "fmla z15.s, z24.s, z1.s[3]\n"
+ "fmla z19.s, z24.s, z2.s[3]\n"
+ "fmla z23.s, z24.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1097,41 +1097,41 @@ void sve_hybrid_fp32_mla_6x4VL (
"add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
"51:" // Height 4: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1195,30 +1195,30 @@ void sve_hybrid_fp32_mla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1246,15 +1246,15 @@ void sve_hybrid_fp32_mla_6x4VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1265,124 +1265,124 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"60:" // Height 5: input setup done
"cmp x27, #0x4\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z29.s, z4.s[0]\n"
+ "fmla z12.s, z29.s, z3.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
"add x25, x25, #0x10\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z24.s, z29.s, z0.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z21.s, z28.s, z1.s[0]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[0]\n"
+ "fmla z14.s, z29.s, z3.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[0]\n"
+ "fmla z27.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[1]\n"
+ "fmla z12.s, z29.s, z3.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "fmla z24.s, z29.s, z0.s[1]\n"
+ "fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z25.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z4.s[1]\n"
+ "fmla z14.s, z29.s, z3.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z26.s, z29.s, z0.s[1]\n"
+ "fmla z11.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z27.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[2]\n"
+ "fmla z12.s, z29.s, z3.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[2]\n"
+ "fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z1.s[2]\n"
+ "fmla z25.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[2]\n"
+ "fmla z14.s, z29.s, z3.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z1.s[2]\n"
+ "fmla z26.s, z29.s, z0.s[2]\n"
+ "fmla z11.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z1.s[2]\n"
+ "fmla z27.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[3]\n"
+ "fmla z12.s, z29.s, z3.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z1.s[3]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z9.s, z28.s, z4.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[3]\n"
+ "fmla z14.s, z29.s, z3.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z1.s[3]\n"
+ "fmla z26.s, z29.s, z0.s[3]\n"
+ "fmla z11.s, z28.s, z4.s[3]\n"
+ "fmla z15.s, z28.s, z3.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z1.s[3]\n"
+ "fmla z27.s, z28.s, z0.s[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -1392,111 +1392,111 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z2.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z29.s, z0.s[0]\n"
+ "fmla z12.s, z29.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z4.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
+ "fmla z21.s, z28.s, z3.s[0]\n"
+ "fmla z25.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z10.s, z29.s, z0.s[0]\n"
+ "fmla z14.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[0]\n"
+ "fmla z26.s, z29.s, z4.s[0]\n"
+ "fmla z11.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z3.s[0]\n"
+ "fmla z27.s, z28.s, z4.s[0]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[1]\n"
+ "fmla z12.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z3.s[1]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.s, z29.s, z4.s[1]\n"
+ "fmla z9.s, z28.s, z0.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z3.s[1]\n"
+ "fmla z25.s, z28.s, z4.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z10.s, z29.s, z0.s[1]\n"
+ "fmla z14.s, z29.s, z1.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z3.s[1]\n"
+ "fmla z26.s, z29.s, z4.s[1]\n"
+ "fmla z11.s, z28.s, z0.s[1]\n"
+ "fmla z15.s, z28.s, z1.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z3.s[1]\n"
+ "fmla z27.s, z28.s, z4.s[1]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[2]\n"
+ "fmla z12.s, z29.s, z1.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z3.s[2]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.s, z29.s, z4.s[2]\n"
+ "fmla z9.s, z28.s, z0.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z3.s[2]\n"
+ "fmla z25.s, z28.s, z4.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z10.s, z29.s, z0.s[2]\n"
+ "fmla z14.s, z29.s, z1.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z3.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[2]\n"
+ "fmla z11.s, z28.s, z0.s[2]\n"
+ "fmla z15.s, z28.s, z1.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z3.s[2]\n"
+ "fmla z27.s, z28.s, z4.s[2]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[3]\n"
+ "fmla z12.s, z29.s, z1.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z3.s[3]\n"
+ "fmla z24.s, z29.s, z4.s[3]\n"
+ "fmla z9.s, z28.s, z0.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z3.s[3]\n"
+ "fmla z25.s, z28.s, z4.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z0.s[3]\n"
+ "fmla z14.s, z29.s, z1.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z3.s[3]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z11.s, z28.s, z0.s[3]\n"
+ "fmla z15.s, z28.s, z1.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z3.s[3]\n"
+ "fmla z27.s, z28.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1509,49 +1509,49 @@ void sve_hybrid_fp32_mla_6x4VL (
"add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z23.s, p5/M, z23.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z23.s, p5/M, z23.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
"64:" // Height 5: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1626,35 +1626,35 @@ void sve_hybrid_fp32_mla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x24]\n"
- "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x22]\n"
- "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1686,16 +1686,16 @@ void sve_hybrid_fp32_mla_6x4VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1707,143 +1707,143 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"73:" // Height 6: input setup done
"cmp x27, #0x4\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
"sub x27, x27, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[0]\n"
+ "fmla z12.s, z1.s, z6.s[0]\n"
+ "fmla z16.s, z1.s, z5.s[0]\n"
+ "fmla z20.s, z1.s, z4.s[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z24.s, z1.s, z3.s[0]\n"
+ "fmla z28.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[0]\n"
+ "fmla z13.s, z0.s, z6.s[0]\n"
+ "fmla z17.s, z0.s, z5.s[0]\n"
+ "fmla z21.s, z0.s, z4.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[0]\n"
+ "fmla z29.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[0]\n"
+ "fmla z14.s, z1.s, z6.s[0]\n"
+ "fmla z18.s, z1.s, z5.s[0]\n"
+ "fmla z22.s, z1.s, z4.s[0]\n"
+ "fmla z26.s, z1.s, z3.s[0]\n"
+ "fmla z30.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[0]\n"
+ "fmla z15.s, z0.s, z6.s[0]\n"
+ "fmla z19.s, z0.s, z5.s[0]\n"
+ "fmla z23.s, z0.s, z4.s[0]\n"
+ "fmla z27.s, z0.s, z3.s[0]\n"
+ "fmla z31.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[1]\n"
+ "fmla z12.s, z1.s, z6.s[1]\n"
+ "fmla z16.s, z1.s, z5.s[1]\n"
+ "fmla z20.s, z1.s, z4.s[1]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z28.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[1]\n"
+ "fmla z13.s, z0.s, z6.s[1]\n"
+ "fmla z17.s, z0.s, z5.s[1]\n"
+ "fmla z21.s, z0.s, z4.s[1]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z29.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z1.s, z7.s[1]\n"
+ "fmla z14.s, z1.s, z6.s[1]\n"
+ "fmla z18.s, z1.s, z5.s[1]\n"
+ "fmla z22.s, z1.s, z4.s[1]\n"
+ "fmla z26.s, z1.s, z3.s[1]\n"
+ "fmla z30.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[1]\n"
+ "fmla z15.s, z0.s, z6.s[1]\n"
+ "fmla z19.s, z0.s, z5.s[1]\n"
+ "fmla z23.s, z0.s, z4.s[1]\n"
+ "fmla z27.s, z0.s, z3.s[1]\n"
+ "fmla z31.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[2]\n"
+ "fmla z12.s, z1.s, z6.s[2]\n"
+ "fmla z16.s, z1.s, z5.s[2]\n"
+ "fmla z20.s, z1.s, z4.s[2]\n"
+ "fmla z24.s, z1.s, z3.s[2]\n"
+ "fmla z28.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[2]\n"
+ "fmla z13.s, z0.s, z6.s[2]\n"
+ "fmla z17.s, z0.s, z5.s[2]\n"
+ "fmla z21.s, z0.s, z4.s[2]\n"
+ "fmla z25.s, z0.s, z3.s[2]\n"
+ "fmla z29.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[2]\n"
+ "fmla z14.s, z1.s, z6.s[2]\n"
+ "fmla z18.s, z1.s, z5.s[2]\n"
+ "fmla z22.s, z1.s, z4.s[2]\n"
+ "fmla z26.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[2]\n"
+ "fmla z15.s, z0.s, z6.s[2]\n"
+ "fmla z19.s, z0.s, z5.s[2]\n"
+ "fmla z23.s, z0.s, z4.s[2]\n"
+ "fmla z27.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[3]\n"
+ "fmla z12.s, z1.s, z6.s[3]\n"
+ "fmla z16.s, z1.s, z5.s[3]\n"
+ "fmla z20.s, z1.s, z4.s[3]\n"
+ "fmla z24.s, z1.s, z3.s[3]\n"
+ "fmla z28.s, z1.s, z2.s[3]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[3]\n"
+ "fmla z13.s, z0.s, z6.s[3]\n"
+ "fmla z17.s, z0.s, z5.s[3]\n"
+ "fmla z21.s, z0.s, z4.s[3]\n"
+ "fmla z25.s, z0.s, z3.s[3]\n"
+ "fmla z29.s, z0.s, z2.s[3]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[3]\n"
+ "fmla z14.s, z1.s, z6.s[3]\n"
+ "fmla z18.s, z1.s, z5.s[3]\n"
+ "fmla z22.s, z1.s, z4.s[3]\n"
+ "fmla z26.s, z1.s, z3.s[3]\n"
+ "fmla z30.s, z1.s, z2.s[3]\n"
+ "fmla z11.s, z0.s, z7.s[3]\n"
+ "fmla z15.s, z0.s, z6.s[3]\n"
+ "fmla z19.s, z0.s, z5.s[3]\n"
+ "fmla z23.s, z0.s, z4.s[3]\n"
+ "fmla z27.s, z0.s, z3.s[3]\n"
+ "fmla z31.s, z0.s, z2.s[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
@@ -1854,127 +1854,127 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z3.s }, p0/Z, [x23]\n"
"ld1rqw { z4.s }, p0/Z, [x22]\n"
"ld1rqw { z5.s }, p0/Z, [x21]\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[0]\n"
+ "fmla z12.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z7.s, z3.s[0]\n"
+ "fmla z24.s, z7.s, z4.s[0]\n"
+ "fmla z28.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[0]\n"
+ "fmla z21.s, z6.s, z3.s[0]\n"
+ "fmla z25.s, z6.s, z4.s[0]\n"
+ "fmla z29.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
+ "fmla z10.s, z7.s, z0.s[0]\n"
+ "fmla z14.s, z7.s, z1.s[0]\n"
+ "fmla z18.s, z7.s, z2.s[0]\n"
+ "fmla z22.s, z7.s, z3.s[0]\n"
+ "fmla z26.s, z7.s, z4.s[0]\n"
+ "fmla z30.s, z7.s, z5.s[0]\n"
+ "fmla z11.s, z6.s, z0.s[0]\n"
+ "fmla z15.s, z6.s, z1.s[0]\n"
+ "fmla z19.s, z6.s, z2.s[0]\n"
+ "fmla z23.s, z6.s, z3.s[0]\n"
+ "fmla z27.s, z6.s, z4.s[0]\n"
+ "fmla z31.s, z6.s, z5.s[0]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[1]\n"
+ "fmla z12.s, z7.s, z1.s[1]\n"
+ "fmla z16.s, z7.s, z2.s[1]\n"
+ "fmla z20.s, z7.s, z3.s[1]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.s, z7.s, z4.s[1]\n"
+ "fmla z28.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[1]\n"
+ "fmla z13.s, z6.s, z1.s[1]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z21.s, z6.s, z3.s[1]\n"
+ "fmla z25.s, z6.s, z4.s[1]\n"
+ "fmla z29.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
+ "fmla z10.s, z7.s, z0.s[1]\n"
+ "fmla z14.s, z7.s, z1.s[1]\n"
+ "fmla z18.s, z7.s, z2.s[1]\n"
+ "fmla z22.s, z7.s, z3.s[1]\n"
+ "fmla z26.s, z7.s, z4.s[1]\n"
+ "fmla z30.s, z7.s, z5.s[1]\n"
+ "fmla z11.s, z6.s, z0.s[1]\n"
+ "fmla z15.s, z6.s, z1.s[1]\n"
+ "fmla z19.s, z6.s, z2.s[1]\n"
+ "fmla z23.s, z6.s, z3.s[1]\n"
+ "fmla z27.s, z6.s, z4.s[1]\n"
+ "fmla z31.s, z6.s, z5.s[1]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[2]\n"
+ "fmla z12.s, z7.s, z1.s[2]\n"
+ "fmla z16.s, z7.s, z2.s[2]\n"
+ "fmla z20.s, z7.s, z3.s[2]\n"
"subs x27, x27, #0x1\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.s, z7.s, z4.s[2]\n"
+ "fmla z28.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[2]\n"
+ "fmla z13.s, z6.s, z1.s[2]\n"
+ "fmla z17.s, z6.s, z2.s[2]\n"
+ "fmla z21.s, z6.s, z3.s[2]\n"
+ "fmla z25.s, z6.s, z4.s[2]\n"
+ "fmla z29.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
+ "fmla z10.s, z7.s, z0.s[2]\n"
+ "fmla z14.s, z7.s, z1.s[2]\n"
+ "fmla z18.s, z7.s, z2.s[2]\n"
+ "fmla z22.s, z7.s, z3.s[2]\n"
+ "fmla z26.s, z7.s, z4.s[2]\n"
+ "fmla z30.s, z7.s, z5.s[2]\n"
+ "fmla z11.s, z6.s, z0.s[2]\n"
+ "fmla z15.s, z6.s, z1.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[2]\n"
+ "fmla z23.s, z6.s, z3.s[2]\n"
+ "fmla z27.s, z6.s, z4.s[2]\n"
+ "fmla z31.s, z6.s, z5.s[2]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[3]\n"
+ "fmla z12.s, z7.s, z1.s[3]\n"
+ "fmla z16.s, z7.s, z2.s[3]\n"
+ "fmla z20.s, z7.s, z3.s[3]\n"
+ "fmla z24.s, z7.s, z4.s[3]\n"
+ "fmla z28.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[3]\n"
+ "fmla z13.s, z6.s, z1.s[3]\n"
+ "fmla z17.s, z6.s, z2.s[3]\n"
+ "fmla z21.s, z6.s, z3.s[3]\n"
+ "fmla z25.s, z6.s, z4.s[3]\n"
+ "fmla z29.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z7.s, z0.s[3]\n"
+ "fmla z14.s, z7.s, z1.s[3]\n"
+ "fmla z18.s, z7.s, z2.s[3]\n"
+ "fmla z22.s, z7.s, z3.s[3]\n"
+ "fmla z26.s, z7.s, z4.s[3]\n"
+ "fmla z30.s, z7.s, z5.s[3]\n"
+ "fmla z11.s, z6.s, z0.s[3]\n"
+ "fmla z15.s, z6.s, z1.s[3]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z23.s, z6.s, z3.s[3]\n"
+ "fmla z27.s, z6.s, z4.s[3]\n"
+ "fmla z31.s, z6.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -2081,7 +2081,6 @@ void sve_hybrid_fp32_mla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2089,4 +2088,4 @@ void sve_hybrid_fp32_mla_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index c0718b1e75..a353c9d660 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#define ARGLIST \
@@ -89,5 +89,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
index 2ccd050f18..344341205b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
@@ -127,11 +127,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
"cbnz x10, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -143,19 +143,19 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"addvl x12, x12, #1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"addvl x12, x12, #1\n"
"bne 6b\n"
"tbz %x[flags], #1, 11f\n"
@@ -189,9 +189,9 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"15:" // Height 2: no bias
"tbz %x[flags], #0, 16f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
+ "add x20, x11, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"b 17f\n"
"16:" // Height 2: no accumulate
"mov z24.b, #0x0\n"
@@ -201,12 +201,12 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
"cbnz x10, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -214,30 +214,30 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
"20:" // Height 2: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"ble 22f\n"
"21:" // Height 2: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"bgt 21b\n"
"22:" // Height 2: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
"bne 18b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -277,11 +277,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"27:" // Height 3: no bias
"tbz %x[flags], #0, 28f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"b 29f\n"
"28:" // Height 3: no accumulate
"mov z24.b, #0x0\n"
@@ -292,13 +292,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
"cbnz x10, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -307,8 +307,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 32f\n"
"31:" // Height 3: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
"32:" // Height 3: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -316,14 +316,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z2.s }, p1/Z, [x26]\n"
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"addvl x12, x12, #1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
@@ -331,13 +331,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -381,13 +381,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"39:" // Height 4: no bias
"tbz %x[flags], #0, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "ld1w { z27.s }, p0/Z, [x25]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
"b 41f\n"
"40:" // Height 4: no accumulate
"mov z24.b, #0x0\n"
@@ -399,14 +399,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"42:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 43f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
"cbnz x10, 44f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -416,9 +416,9 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 44f\n"
"43:" // Height 4: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"44:" // Height 4: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -427,16 +427,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z3.s }, p1/Z, [x25]\n"
"ble 46f\n"
"45:" // Height 4: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
- "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
"addvl x12, x12, #1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
@@ -445,14 +445,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 45b\n"
"46:" // Height 4: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
- "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
"bne 42b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -501,15 +501,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"51:" // Height 5: no bias
"tbz %x[flags], #0, 52f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "ld1w { z27.s }, p0/Z, [x25]\n"
- "ld1w { z28.s }, p0/Z, [x24]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
"b 53f\n"
"52:" // Height 5: no accumulate
"mov z24.b, #0x0\n"
@@ -522,15 +522,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"54:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 55f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
"cbnz x10, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -541,10 +541,10 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 56f\n"
"55:" // Height 5: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"56:" // Height 5: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -554,20 +554,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z4.s }, p1/Z, [x24]\n"
"ble 58f\n"
"57:" // Height 5: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
@@ -575,15 +575,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 57b\n"
"58:" // Height 5: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
- "fmla z27.s, p1/M, z9.s, z3.s\n"
- "fmla z28.s, p1/M, z9.s, z4.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"bne 54b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -636,18 +636,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 65f\n"
"63:" // Height 6: no bias
"tbz %x[flags], #0, 64f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x25]\n"
- "ld1w { z28.s }, p0/Z, [x24]\n"
- "ld1w { z29.s }, p0/Z, [x23]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
"b 65f\n"
"64:" // Height 6: no accumulate
"mov z24.b, #0x0\n"
@@ -661,16 +661,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"66:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
"cbnz x10, 68f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -682,11 +682,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 68f\n"
"67:" // Height 6: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"68:" // Height 6: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -697,21 +697,21 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z5.s }, p1/Z, [x23]\n"
"ble 70f\n"
"69:" // Height 6: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "fmla z27.s, p1/M, z8.s, z3.s\n"
- "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"add x23, x23, #0x4\n"
"addvl x12, x12, #1\n"
- "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
@@ -721,16 +721,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 69b\n"
"70:" // Height 6: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
- "fmla z27.s, p1/M, z9.s, z3.s\n"
- "fmla z28.s, p1/M, z9.s, z4.s\n"
- "fmla z29.s, p1/M, z9.s, z5.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"bne 66b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -788,20 +788,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 77f\n"
"75:" // Height 7: no bias
"tbz %x[flags], #0, 76f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x25]\n"
- "ld1w { z28.s }, p0/Z, [x24]\n"
- "ld1w { z29.s }, p0/Z, [x23]\n"
- "ld1w { z30.s }, p0/Z, [x22]\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"b 77f\n"
"76:" // Height 7: no accumulate
"mov z24.b, #0x0\n"
@@ -816,17 +816,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"78:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
"cbnz x10, 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -839,12 +839,12 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 80f\n"
"79:" // Height 7: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"80:" // Height 7: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -856,25 +856,25 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z6.s }, p1/Z, [x22]\n"
"ble 82f\n"
"81:" // Height 7: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"add x23, x23, #0x4\n"
"add x22, x22, #0x4\n"
- "fmla z28.s, p1/M, z8.s, z4.s\n"
- "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"addvl x12, x12, #1\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z30.s, p1/M, z8.s, z6.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
@@ -883,17 +883,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 81b\n"
"82:" // Height 7: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
- "fmla z27.s, p1/M, z9.s, z3.s\n"
- "fmla z28.s, p1/M, z9.s, z4.s\n"
- "fmla z29.s, p1/M, z9.s, z5.s\n"
- "fmla z30.s, p1/M, z9.s, z6.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
"bne 78b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -959,22 +959,22 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 89f\n"
"87:" // Height 8: no bias
"tbz %x[flags], #0, 88f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
"ld1w { z24.s }, p0/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x27]\n"
- "ld1w { z26.s }, p0/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z27.s }, p0/Z, [x25]\n"
- "ld1w { z28.s }, p0/Z, [x24]\n"
- "add x21, x22, x20, LSL #2\n"
- "ld1w { z29.s }, p0/Z, [x23]\n"
- "ld1w { z30.s }, p0/Z, [x22]\n"
- "ld1w { z31.s }, p0/Z, [x21]\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
"b 89f\n"
"88:" // Height 8: no accumulate
"mov z24.b, #0x0\n"
@@ -990,18 +990,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"90:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 91f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
- "ldr x21, [x21, #0x38]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
"cbnz x10, 92f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1015,13 +1015,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"b 92f\n"
"91:" // Height 8: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"92:" // Height 8: input setup done
"subs x9, x9, #0x1\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -1034,27 +1034,27 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"ld1rw { z7.s }, p1/Z, [x21]\n"
"ble 94f\n"
"93:" // Height 8: Multiply loop: Main loop
- "ld1w { z8.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x28, x28, #0x4\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
"add x27, x27, #0x4\n"
"add x26, x26, #0x4\n"
- "fmla z25.s, p1/M, z8.s, z1.s\n"
- "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
"add x25, x25, #0x4\n"
"add x24, x24, #0x4\n"
- "fmla z27.s, p1/M, z8.s, z3.s\n"
- "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
"add x23, x23, #0x4\n"
"add x22, x22, #0x4\n"
- "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
"ld1rw { z0.s }, p1/Z, [x28]\n"
"add x21, x21, #0x4\n"
"addvl x12, x12, #1\n"
"ld1rw { z1.s }, p1/Z, [x27]\n"
- "fmla z30.s, p1/M, z8.s, z6.s\n"
- "fmla z31.s, p1/M, z8.s, z7.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "fmla z31.s, p1/M, z16.s, z7.s\n"
"ld1rw { z2.s }, p1/Z, [x26]\n"
"ld1rw { z3.s }, p1/Z, [x25]\n"
"ld1rw { z4.s }, p1/Z, [x24]\n"
@@ -1064,18 +1064,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"bgt 93b\n"
"94:" // Height 8: Multiply loop: Main loop skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "ld1w { z9.s }, p1/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
"add x10, x10, #0x1\n"
"cmp x10, x20\n"
- "fmla z24.s, p1/M, z9.s, z0.s\n"
- "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, p1/M, z9.s, z2.s\n"
- "fmla z27.s, p1/M, z9.s, z3.s\n"
- "fmla z28.s, p1/M, z9.s, z4.s\n"
- "fmla z29.s, p1/M, z9.s, z5.s\n"
- "fmla z30.s, p1/M, z9.s, z6.s\n"
- "fmla z31.s, p1/M, z9.s, z7.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "fmla z31.s, p1/M, z16.s, z7.s\n"
"bne 90b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x27, x11, x20, LSL #2\n"
@@ -1132,12 +1132,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"98:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 9679d49506..161c85e5f3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -127,11 +127,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
"cbnz x10, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -144,39 +144,39 @@ void sve_hybrid_fp32_mla_8x1VL (
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"sub x9, x9, #0x4\n"
"cmp x9, #0x4\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
"add x28, x28, #0x10\n"
"addvl x12, x12, #4\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
"ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
"addvl x12, x12, #1\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -214,9 +214,9 @@ void sve_hybrid_fp32_mla_8x1VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
+ "add x20, x11, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z24.b, #0x0\n"
@@ -226,12 +226,12 @@ void sve_hybrid_fp32_mla_8x1VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
"cbnz x10, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -239,29 +239,29 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
"21:" // Height 2: input setup done
"cmp x9, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z25.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z1.s[1]\n"
+ "fmla z25.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z24.s, z17.s, z1.s[2]\n"
+ "fmla z25.s, z17.s, z0.s[2]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z24.s, z16.s, z1.s[3]\n"
+ "fmla z25.s, z16.s, z0.s[3]\n"
"add x27, x27, #0x10\n"
"addvl x12, x12, #4\n"
"bgt 22b\n"
@@ -270,26 +270,26 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
"subs x9, x9, #0x1\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -333,11 +333,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z24.b, #0x0\n"
@@ -348,13 +348,13 @@ void sve_hybrid_fp32_mla_8x1VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
"cbnz x10, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -363,38 +363,38 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
"34:" // Height 3: input setup done
"cmp x9, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z2.s }, p0/Z, [x28]\n"
"ld1rqw { z1.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z2.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z2.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"cmp x9, #0x4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z17.s, z2.s[2]\n"
+ "fmla z25.s, z17.s, z1.s[2]\n"
"add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z26.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z2.s[3]\n"
"add x26, x26, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
+ "fmla z26.s, z16.s, z0.s[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -402,31 +402,31 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z1.s }, p0/Z, [x27]\n"
"subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
"addvl x12, x12, #1\n"
"ble 37f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
"addvl x12, x12, #1\n"
"ble 37f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
"addvl x12, x12, #1\n"
"ble 37f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -474,13 +474,13 @@ void sve_hybrid_fp32_mla_8x1VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
- "ld1w { z27.s }, p1/Z, [x25]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z24.b, #0x0\n"
@@ -492,14 +492,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
"cbnz x10, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -509,45 +509,45 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"47:" // Height 4: input setup done
"cmp x9, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "ld1rqw { z2.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z3.s[0]\n"
+ "fmla z25.s, z16.s, z2.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z16.s, z1.s[0]\n"
+ "fmla z27.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z24.s, z18.s, z3.s[1]\n"
+ "fmla z25.s, z18.s, z2.s[1]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z26.s, z18.s, z1.s[1]\n"
+ "fmla z27.s, z18.s, z0.s[1]\n"
"add x25, x25, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z24.s, z17.s, z3.s[2]\n"
+ "fmla z25.s, z17.s, z2.s[2]\n"
+ "fmla z26.s, z17.s, z1.s[2]\n"
+ "fmla z27.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z3.s[3]\n"
+ "fmla z25.s, z16.s, z2.s[3]\n"
+ "fmla z26.s, z16.s, z1.s[3]\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -556,35 +556,35 @@ void sve_hybrid_fp32_mla_8x1VL (
"subs x9, x9, #0x1\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"ble 50f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
"ble 50f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
"ble 50f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -637,15 +637,15 @@ void sve_hybrid_fp32_mla_8x1VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
- "ld1w { z27.s }, p1/Z, [x25]\n"
- "ld1w { z28.s }, p1/Z, [x24]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z24.b, #0x0\n"
@@ -658,15 +658,15 @@ void sve_hybrid_fp32_mla_8x1VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
"cbnz x10, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -677,52 +677,52 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"60:" // Height 5: input setup done
"cmp x9, #0x4\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z4.s }, p0/Z, [x28]\n"
+ "ld1rqw { z3.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
"ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x24]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z4.s[0]\n"
+ "fmla z25.s, z16.s, z3.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z1.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z4.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z25.s, z18.s, z3.s[1]\n"
+ "fmla z26.s, z18.s, z2.s[1]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z27.s, z18.s, z1.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[1]\n"
"add x24, x24, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z17.s, z4.s[2]\n"
+ "fmla z25.s, z17.s, z3.s[2]\n"
+ "fmla z26.s, z17.s, z2.s[2]\n"
+ "fmla z27.s, z17.s, z1.s[2]\n"
+ "fmla z28.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z4.s[3]\n"
+ "fmla z25.s, z16.s, z3.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z1.s[3]\n"
+ "fmla z28.s, z16.s, z0.s[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -732,39 +732,39 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z2.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
"ble 63f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
"ble 63f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
"ble 63f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -821,18 +821,18 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x25]\n"
- "ld1w { z28.s }, p1/Z, [x24]\n"
- "ld1w { z29.s }, p1/Z, [x23]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x22]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z24.b, #0x0\n"
@@ -846,16 +846,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
"cbnz x10, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -867,59 +867,59 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"73:" // Height 6: input setup done
"cmp x9, #0x4\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z5.s }, p0/Z, [x28]\n"
+ "ld1rqw { z4.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x24]\n"
- "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z25.s, z19.s, z4.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z19.s, z3.s[0]\n"
+ "fmla z27.s, z19.s, z2.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z28.s, z19.s, z1.s[0]\n"
+ "fmla z29.s, z19.s, z0.s[0]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z25.s, z18.s, z4.s[1]\n"
"add x23, x23, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z26.s, z18.s, z3.s[1]\n"
+ "fmla z27.s, z18.s, z2.s[1]\n"
+ "fmla z28.s, z18.s, z1.s[1]\n"
+ "fmla z29.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z5.s[2]\n"
+ "fmla z25.s, z17.s, z4.s[2]\n"
+ "fmla z26.s, z17.s, z3.s[2]\n"
+ "fmla z27.s, z17.s, z2.s[2]\n"
+ "fmla z28.s, z17.s, z1.s[2]\n"
+ "fmla z29.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z5.s[3]\n"
+ "fmla z25.s, z16.s, z4.s[3]\n"
+ "fmla z26.s, z16.s, z3.s[3]\n"
+ "fmla z27.s, z16.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z1.s[3]\n"
+ "fmla z29.s, z16.s, z0.s[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -930,43 +930,43 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z5.s }, p0/Z, [x23]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
"ble 76f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
"ble 76f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
"ble 76f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -1028,20 +1028,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 83f\n"
"81:" // Height 7: no bias
"tbz %x[flags], #0, 82f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x25]\n"
- "ld1w { z28.s }, p1/Z, [x24]\n"
- "ld1w { z29.s }, p1/Z, [x23]\n"
- "ld1w { z30.s }, p1/Z, [x22]\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"b 83f\n"
"82:" // Height 7: no accumulate
"mov z24.b, #0x0\n"
@@ -1056,17 +1056,17 @@ void sve_hybrid_fp32_mla_8x1VL (
"84:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 85f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
"cbnz x10, 86f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1079,66 +1079,66 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 86f\n"
"85:" // Height 7: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"86:" // Height 7: input setup done
"cmp x9, #0x4\n"
"ble 88f\n"
"87:" // Height 7: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z6.s }, p0/Z, [x28]\n"
+ "ld1rqw { z5.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
"ld1rqw { z3.s }, p0/Z, [x25]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x24]\n"
- "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z6.s[0]\n"
+ "fmla z25.s, z19.s, z5.s[0]\n"
+ "fmla z26.s, z19.s, z4.s[0]\n"
+ "fmla z27.s, z19.s, z3.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z28.s, z19.s, z2.s[0]\n"
+ "fmla z29.s, z19.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
+ "fmla z30.s, z19.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z6.s[1]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z25.s, z18.s, z5.s[1]\n"
+ "fmla z26.s, z18.s, z4.s[1]\n"
"add x22, x22, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z27.s, z18.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z2.s[1]\n"
+ "fmla z29.s, z18.s, z1.s[1]\n"
+ "fmla z30.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z6.s[2]\n"
+ "fmla z25.s, z17.s, z5.s[2]\n"
+ "fmla z26.s, z17.s, z4.s[2]\n"
+ "fmla z27.s, z17.s, z3.s[2]\n"
+ "fmla z28.s, z17.s, z2.s[2]\n"
+ "fmla z29.s, z17.s, z1.s[2]\n"
+ "fmla z30.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z6.s[3]\n"
+ "fmla z25.s, z16.s, z5.s[3]\n"
+ "fmla z26.s, z16.s, z4.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z2.s[3]\n"
+ "fmla z29.s, z16.s, z1.s[3]\n"
+ "fmla z30.s, z16.s, z0.s[3]\n"
"bgt 87b\n"
"88:" // Height 7: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -1150,47 +1150,47 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z5.s }, p0/Z, [x23]\n"
"ld1rqw { z6.s }, p0/Z, [x22]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
+ "fmla z30.s, z16.s, z6.s[0]\n"
"ble 89f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
+ "fmla z30.s, z16.s, z6.s[1]\n"
"ble 89f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
+ "fmla z30.s, z16.s, z6.s[2]\n"
"ble 89f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
+ "fmla z30.s, z16.s, z6.s[3]\n"
"89:" // Height 7: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -1260,22 +1260,22 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 96f\n"
"94:" // Height 8: no bias
"tbz %x[flags], #0, 95f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x27, x11, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x11]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x27]\n"
- "ld1w { z26.s }, p1/Z, [x26]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x25]\n"
- "ld1w { z28.s }, p1/Z, [x24]\n"
- "add x21, x22, x20, LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x23]\n"
- "ld1w { z30.s }, p1/Z, [x22]\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"b 96f\n"
"95:" // Height 8: no accumulate
"mov z24.b, #0x0\n"
@@ -1291,18 +1291,18 @@ void sve_hybrid_fp32_mla_8x1VL (
"97:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w9, [x20, x10, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 98f\n"
- "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x28, [x21, #0x0]\n"
- "ldr x27, [x21, #0x8]\n"
- "ldr x26, [x21, #0x10]\n"
- "ldr x25, [x21, #0x18]\n"
- "ldr x24, [x21, #0x20]\n"
- "ldr x23, [x21, #0x28]\n"
- "ldr x22, [x21, #0x30]\n"
- "ldr x21, [x21, #0x38]\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
"cbnz x10, 99f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x28, x28, x20, LSL #2\n"
@@ -1316,73 +1316,73 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 99f\n"
"98:" // Height 8: setup direct input
"mov x28, %x[input_ptr]\n"
- "add x27, x28, x20, LSL #2\n"
- "add x26, x27, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"99:" // Height 8: input setup done
"cmp x9, #0x4\n"
"ble 101f\n"
"100:" // Height 8: Multiply loop: Main loop head
"whilelt p0.s, XZR, x9\n"
- "ld1rqw { z0.s }, p0/Z, [x28]\n"
- "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "ld1rqw { z7.s }, p0/Z, [x28]\n"
+ "ld1rqw { z6.s }, p0/Z, [x27]\n"
"sub x9, x9, #0x4\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z5.s }, p0/Z, [x26]\n"
+ "ld1rqw { z4.s }, p0/Z, [x25]\n"
"cmp x9, #0x4\n"
"add x28, x28, #0x10\n"
- "ld1rqw { z4.s }, p0/Z, [x24]\n"
- "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
"add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
- "ld1rqw { z7.s }, p0/Z, [x21]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ "ld1rqw { z0.s }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z25.s, z19.s, z6.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z19.s, z5.s[0]\n"
+ "fmla z27.s, z19.s, z4.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z28.s, z19.s, z3.s[0]\n"
+ "fmla z29.s, z19.s, z2.s[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "fmla z31.s, z8.s, z7.s[0]\n"
+ "fmla z30.s, z19.s, z1.s[0]\n"
+ "fmla z31.s, z19.s, z0.s[0]\n"
"add x21, x21, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "fmla z31.s, z9.s, z7.s[1]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z31.s, z10.s, z7.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
- "fmla z31.s, z11.s, z7.s[3]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z25.s, z18.s, z6.s[1]\n"
+ "fmla z26.s, z18.s, z5.s[1]\n"
+ "fmla z27.s, z18.s, z4.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[1]\n"
+ "fmla z29.s, z18.s, z2.s[1]\n"
+ "fmla z30.s, z18.s, z1.s[1]\n"
+ "fmla z31.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z7.s[2]\n"
+ "fmla z25.s, z17.s, z6.s[2]\n"
+ "fmla z26.s, z17.s, z5.s[2]\n"
+ "fmla z27.s, z17.s, z4.s[2]\n"
+ "fmla z28.s, z17.s, z3.s[2]\n"
+ "fmla z29.s, z17.s, z2.s[2]\n"
+ "fmla z30.s, z17.s, z1.s[2]\n"
+ "fmla z31.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z7.s[3]\n"
+ "fmla z25.s, z16.s, z6.s[3]\n"
+ "fmla z26.s, z16.s, z5.s[3]\n"
+ "fmla z27.s, z16.s, z4.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[3]\n"
+ "fmla z29.s, z16.s, z2.s[3]\n"
+ "fmla z30.s, z16.s, z1.s[3]\n"
+ "fmla z31.s, z16.s, z0.s[3]\n"
"bgt 100b\n"
"101:" // Height 8: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x9\n"
@@ -1395,51 +1395,51 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z5.s }, p0/Z, [x23]\n"
"ld1rqw { z6.s }, p0/Z, [x22]\n"
"ld1rqw { z7.s }, p0/Z, [x21]\n"
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "fmla z31.s, z8.s, z7.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
+ "fmla z30.s, z16.s, z6.s[0]\n"
+ "fmla z31.s, z16.s, z7.s[0]\n"
"ble 102f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "fmla z31.s, z9.s, z7.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
+ "fmla z30.s, z16.s, z6.s[1]\n"
+ "fmla z31.s, z16.s, z7.s[1]\n"
"ble 102f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
"subs x9, x9, #0x1\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z31.s, z10.s, z7.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
+ "fmla z30.s, z16.s, z6.s[2]\n"
+ "fmla z31.s, z16.s, z7.s[2]\n"
"ble 102f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
- "fmla z31.s, z11.s, z7.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
+ "fmla z30.s, z16.s, z6.s[3]\n"
+ "fmla z31.s, z16.s, z7.s[3]\n"
"102:" // Height 8: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x10, x10, #0x1\n"
@@ -1500,12 +1500,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"106:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
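
For readers tracing the hunks above: the renumbered FMLA sequences still compute the same hybrid GEMM — for up to eight rows (Heights 1-8), each accumulator z24-z31 holds one vector length (1VL) of output columns, the k-loop is unrolled by four with lane-indexed multiplies, and an optional min/max activation clamp runs in the epilogue. Only the temporary registers changed: loads now reuse a compact z16-z19 pool instead of z8-z11, which is why the clobber list shrinks while the arithmetic and memory traffic are unchanged. A minimal scalar sketch of that computation, assuming row-major A and packed row-major B and ignoring the bias path for brevity (the function name and layout are illustrative, not the arm_gemm API):

#include <algorithm>
#include <cstddef>

// Scalar reference for the 8x1VL hybrid kernel's arithmetic (illustrative
// names, not the library's API). Rows map to accumulators z24..z31, the
// n-loop maps to the SVE lanes of one vector, and the k-loop corresponds
// to the unroll-by-4 lane-indexed FMLA sequence in the hunks above.
void ref_hybrid_fp32_mla(const float *A, std::size_t lda,  // M x K input rows
                         const float *B,                   // K x N packed weights
                         float *C, std::size_t ldc,        // M x N output
                         std::size_t M, std::size_t N, std::size_t K,
                         float minval, float maxval, bool accumulate)
{
    for (std::size_t r = 0; r < M; ++r)
    {
        for (std::size_t n = 0; n < N; ++n)
        {
            float acc = accumulate ? C[r * ldc + n] : 0.0f;
            for (std::size_t k = 0; k < K; ++k)
                acc += A[r * lda + k] * B[k * N + n];
            // Optional activation clamp, matching the fmin/fmax epilogue.
            C[r * ldc + n] = std::min(std::max(acc, minval), maxval);
        }
    }
}
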
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index ab175a3758..66c106d2eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -100,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
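
The header-side churn above is license rewrapping plus the guard placement the commit title calls out: `#pragma once` first, then everything SVE-specific inside a single `ARM_COMPUTE_ENABLE_SVE` region, with no stray blank lines around the directives. A minimal sketch of the resulting pattern (file contents illustrative):

#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE

namespace arm_gemm {
// SVE-only kernel declarations live here, so builds without SVE
// support compile this header to nothing.
} // namespace arm_gemm

#endif // ARM_COMPUTE_ENABLE_SVE

Keeping the trailing `// ARM_COMPUTE_ENABLE_SVE` comment in sync with the `#endif` makes these long generated files easier to navigate.
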
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index 8d05c1ffb3..2b2a0684f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -140,22 +140,22 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z9.s }, p6/Z, [x27]\n"
- "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x27]\n"
+ "ld1w { z20.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
+ "ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
+ "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
+ "zip1 z10.d, z23.d, z16.d\n"
+ "zip2 z16.d, z23.d, z16.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
"b 5f\n"
@@ -177,11 +177,11 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -193,69 +193,69 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
"sub x25, x25, #0x4\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
"cmp x25, #0x4\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
"add x24, x24, #0x10\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "ld1rqw { z23.s }, p0/Z, [x24]\n"
+ ".inst 0x658abef7 // bfcvt z23.h, p7/M, z23.s\n"
+ "uzp1 z23.h, z23.h, z23.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6e8 // bfmmla z8.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ee // bfmmla z14.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6475e6e9 // bfmmla z9.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ef // bfmmla z15.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e6ea // bfmmla z10.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f0 // bfmmla z16.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6475e6eb // bfmmla z11.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f1 // bfmmla z17.s, z23.h, z20.h\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6474e6ec // bfmmla z12.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6476e6f2 // bfmmla z18.s, z23.h, z22.h\n"
+ ".inst 0x6475e6ed // bfmmla z13.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f3 // bfmmla z19.s, z23.h, z20.h\n"
"addvl x28, x28, #-4\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -270,21 +270,21 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "fmin z8.s, p7/M, z8.s, z21.s\n"
+ "fmin z9.s, p7/M, z9.s, z21.s\n"
+ "fmin z10.s, p7/M, z10.s, z21.s\n"
+ "fmin z11.s, p7/M, z11.s, z21.s\n"
+ "fmin z12.s, p7/M, z12.s, z21.s\n"
+ "fmin z13.s, p7/M, z13.s, z21.s\n"
+ "fmax z8.s, p7/M, z8.s, z20.s\n"
+ "fmax z9.s, p7/M, z9.s, z20.s\n"
+ "fmax z10.s, p7/M, z10.s, z20.s\n"
+ "fmax z11.s, p7/M, z11.s, z20.s\n"
+ "fmax z12.s, p7/M, z12.s, z20.s\n"
+ "fmax z13.s, p7/M, z13.s, z20.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p6, [x27]\n"
"st1w { z9.s }, p5, [x27, #1, MUL VL]\n"
@@ -340,29 +340,29 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x27, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x27]\n"
- "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "add x20, x27, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
+ "ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
"b 18f\n"
@@ -384,12 +384,12 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -397,85 +397,85 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"21:" // Height 2: input setup done
"cmp x25, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
"sub x25, x25, #0x4\n"
"cmp x25, #0x4\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"addvl x28, x28, #-4\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -497,33 +497,33 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp2 z13.d, z13.d, z19.d\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z4.s, p7/M, z4.s, z1.s\n"
- "fmin z14.s, p7/M, z14.s, z1.s\n"
- "fmin z15.s, p7/M, z15.s, z1.s\n"
- "fmin z16.s, p7/M, z16.s, z1.s\n"
- "fmin z17.s, p7/M, z17.s, z1.s\n"
- "fmin z18.s, p7/M, z18.s, z1.s\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmax z4.s, p7/M, z4.s, z0.s\n"
- "fmax z14.s, p7/M, z14.s, z0.s\n"
- "fmax z15.s, p7/M, z15.s, z0.s\n"
- "fmax z16.s, p7/M, z16.s, z0.s\n"
- "fmax z17.s, p7/M, z17.s, z0.s\n"
- "fmax z18.s, p7/M, z18.s, z0.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z20.s\n"
+ "fmin z14.s, p7/M, z14.s, z20.s\n"
+ "fmin z15.s, p7/M, z15.s, z20.s\n"
+ "fmin z16.s, p7/M, z16.s, z20.s\n"
+ "fmin z17.s, p7/M, z17.s, z20.s\n"
+ "fmin z18.s, p7/M, z18.s, z20.s\n"
+ "fmin z8.s, p7/M, z8.s, z20.s\n"
+ "fmin z9.s, p7/M, z9.s, z20.s\n"
+ "fmin z10.s, p7/M, z10.s, z20.s\n"
+ "fmin z11.s, p7/M, z11.s, z20.s\n"
+ "fmin z12.s, p7/M, z12.s, z20.s\n"
+ "fmin z13.s, p7/M, z13.s, z20.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
"25:" // Height 2: No activation
"st1w { z4.s }, p6, [x27]\n"
"st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
@@ -597,38 +597,38 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x27, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x27]\n"
- "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "add x21, x27, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "ld1w { z21.s }, p6/Z, [x22]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
- "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x20]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
"zip1 z21.d, z22.d, z27.d\n"
@@ -639,8 +639,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
"zip2 z30.d, z25.d, z30.d\n"
- "zip1 z25.d, z4.d, z31.d\n"
- "zip2 z31.d, z4.d, z31.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -672,13 +672,13 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -687,117 +687,117 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"34:" // Height 3: input setup done
"cmp x25, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
"sub x25, x25, #0x4\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
"cmp x25, #0x4\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
"uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -826,45 +826,45 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"uzp1 z25.d, z25.d, z31.d\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p7/Z, [x20]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
"ld1rw { z0.s }, p7/Z, [x20]\n"
- "fmin z4.s, p7/M, z4.s, z1.s\n"
- "fmin z14.s, p7/M, z14.s, z1.s\n"
- "fmin z15.s, p7/M, z15.s, z1.s\n"
- "fmin z16.s, p7/M, z16.s, z1.s\n"
- "fmin z17.s, p7/M, z17.s, z1.s\n"
- "fmin z18.s, p7/M, z18.s, z1.s\n"
- "fmin z8.s, p7/M, z8.s, z1.s\n"
- "fmin z9.s, p7/M, z9.s, z1.s\n"
- "fmin z10.s, p7/M, z10.s, z1.s\n"
- "fmin z11.s, p7/M, z11.s, z1.s\n"
- "fmin z12.s, p7/M, z12.s, z1.s\n"
- "fmin z13.s, p7/M, z13.s, z1.s\n"
- "fmin z20.s, p7/M, z20.s, z1.s\n"
- "fmin z21.s, p7/M, z21.s, z1.s\n"
- "fmin z22.s, p7/M, z22.s, z1.s\n"
- "fmin z23.s, p7/M, z23.s, z1.s\n"
- "fmin z24.s, p7/M, z24.s, z1.s\n"
- "fmin z25.s, p7/M, z25.s, z1.s\n"
- "fmax z4.s, p7/M, z4.s, z0.s\n"
- "fmax z14.s, p7/M, z14.s, z0.s\n"
- "fmax z15.s, p7/M, z15.s, z0.s\n"
- "fmax z16.s, p7/M, z16.s, z0.s\n"
- "fmax z17.s, p7/M, z17.s, z0.s\n"
- "fmax z18.s, p7/M, z18.s, z0.s\n"
- "fmax z8.s, p7/M, z8.s, z0.s\n"
- "fmax z9.s, p7/M, z9.s, z0.s\n"
- "fmax z10.s, p7/M, z10.s, z0.s\n"
- "fmax z11.s, p7/M, z11.s, z0.s\n"
- "fmax z12.s, p7/M, z12.s, z0.s\n"
- "fmax z13.s, p7/M, z13.s, z0.s\n"
- "fmax z20.s, p7/M, z20.s, z0.s\n"
- "fmax z21.s, p7/M, z21.s, z0.s\n"
- "fmax z22.s, p7/M, z22.s, z0.s\n"
- "fmax z23.s, p7/M, z23.s, z0.s\n"
- "fmax z24.s, p7/M, z24.s, z0.s\n"
- "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z0.s\n"
+ "fmin z14.s, p7/M, z14.s, z0.s\n"
+ "fmin z15.s, p7/M, z15.s, z0.s\n"
+ "fmin z16.s, p7/M, z16.s, z0.s\n"
+ "fmin z17.s, p7/M, z17.s, z0.s\n"
+ "fmin z18.s, p7/M, z18.s, z0.s\n"
+ "fmin z8.s, p7/M, z8.s, z0.s\n"
+ "fmin z9.s, p7/M, z9.s, z0.s\n"
+ "fmin z10.s, p7/M, z10.s, z0.s\n"
+ "fmin z11.s, p7/M, z11.s, z0.s\n"
+ "fmin z12.s, p7/M, z12.s, z0.s\n"
+ "fmin z13.s, p7/M, z13.s, z0.s\n"
+ "fmin z20.s, p7/M, z20.s, z0.s\n"
+ "fmin z21.s, p7/M, z21.s, z0.s\n"
+ "fmin z22.s, p7/M, z22.s, z0.s\n"
+ "fmin z23.s, p7/M, z23.s, z0.s\n"
+ "fmin z24.s, p7/M, z24.s, z0.s\n"
+ "fmin z25.s, p7/M, z25.s, z0.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "fmax z20.s, p7/M, z20.s, z19.s\n"
+ "fmax z21.s, p7/M, z21.s, z19.s\n"
+ "fmax z22.s, p7/M, z22.s, z19.s\n"
+ "fmax z23.s, p7/M, z23.s, z19.s\n"
+ "fmax z24.s, p7/M, z24.s, z19.s\n"
+ "fmax z25.s, p7/M, z25.s, z19.s\n"
"38:" // Height 3: No activation
"st1w { z4.s }, p6, [x27]\n"
"st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
@@ -947,57 +947,57 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x23, x27, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z9.s }, p6/Z, [x27]\n"
+ "add x22, x27, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
- "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
"ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
- "ld1w { z14.s }, p6/Z, [x23]\n"
- "zip1 z8.d, z9.d, z14.d\n"
- "zip2 z14.d, z9.d, z14.d\n"
- "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z15.d\n"
- "zip2 z15.d, z10.d, z15.d\n"
- "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
- "zip1 z10.d, z11.d, z16.d\n"
- "zip2 z16.d, z11.d, z16.d\n"
- "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
- "ld1w { z21.s }, p6/Z, [x22]\n"
- "zip1 z11.d, z12.d, z17.d\n"
- "zip2 z17.d, z12.d, z17.d\n"
- "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
- "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
- "zip1 z12.d, z13.d, z18.d\n"
- "zip2 z18.d, z13.d, z18.d\n"
- "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
"zip1 z13.d, z20.d, z19.d\n"
"zip2 z19.d, z20.d, z19.d\n"
- "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
- "ld1w { z26.s }, p6/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
"zip1 z20.d, z21.d, z26.d\n"
"zip2 z26.d, z21.d, z26.d\n"
- "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
"zip1 z21.d, z22.d, z27.d\n"
"zip2 z27.d, z22.d, z27.d\n"
- "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
"zip1 z22.d, z23.d, z28.d\n"
"zip2 z28.d, z23.d, z28.d\n"
- "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
"zip1 z23.d, z24.d, z29.d\n"
"zip2 z29.d, z24.d, z29.d\n"
"zip1 z24.d, z25.d, z30.d\n"
"zip2 z30.d, z25.d, z30.d\n"
- "zip1 z25.d, z4.d, z31.d\n"
- "zip2 z31.d, z4.d, z31.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1029,14 +1029,14 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20, LSL #2\n"
@@ -1046,127 +1046,127 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"47:" // Height 4: input setup done
"cmp x25, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z3.s }, p0/Z, [x21]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
"sub x25, x25, #0x4\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
"cmp x25, #0x4\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
"add x21, x21, #0x10\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x25\n"
- "ld1rqw { z0.s }, p0/Z, [x24]\n"
- "ld1rqw { z1.s }, p0/Z, [x23]\n"
- ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x22]\n"
- "ld1rqw { z3.s }, p0/Z, [x21]\n"
- ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
- ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
- ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z4.h }, p7/Z, [x28]\n"
- "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
- "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
- ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
- ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
- "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
- ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
"addvl x28, x28, #-4\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
- ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
- ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
- ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
- ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
- ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x26, x26, #0x1\n"
@@ -1295,7 +1295,6 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"54:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1303,4 +1302,4 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index b7c9aca9dd..15b7dd721c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -100,5 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
index 23d7ff9c3b..0d2b47ec39 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -133,16 +133,16 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 5f\n"
@@ -160,11 +160,11 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 8f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -176,51 +176,51 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqw { z18.s }, p0/Z, [x26]\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"add x26, x26, #0x10\n"
"addvl x10, x10, #8\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1rqw { z18.s }, p0/Z, [x26]\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
"addvl x10, x10, #8\n"
"11:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -233,17 +233,17 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 12f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -287,21 +287,21 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 18f\n"
@@ -319,12 +319,12 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 21f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -332,67 +332,67 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 21f\n"
"20:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"21:" // Height 2: input setup done
"cmp x27, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"addvl x10, x10, #8\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
"24:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -410,25 +410,25 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp2 z11.d, z11.d, z15.d\n"
"tbz %x[flags], #1, 25f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z6.s, p5/M, z6.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmax z6.s, p5/M, z6.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z6.s, p5/M, z6.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"25:" // Height 2: No activation
"st1w { z6.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -484,28 +484,28 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -537,13 +537,13 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -552,91 +552,91 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"34:" // Height 3: input setup done
"cmp x27, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ "ld1rqw { z26.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
"sub x27, x27, #0x4\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "trn1 z28.d, z28.d, z27.d\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
"cmp x27, #0x4\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ "ld1rqw { z26.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn1 z28.d, z28.d, z27.d\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
"37:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -659,33 +659,33 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 38f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z6.s, p5/M, z6.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z6.s, p5/M, z6.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z6.s, p5/M, z6.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
"38:" // Height 3: No activation
"st1w { z6.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -745,37 +745,37 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -803,14 +803,14 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -820,101 +820,101 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 47f\n"
"46:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"47:" // Height 4: input setup done
"cmp x27, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
"add x26, x26, #0x10\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
"add x25, x25, #0x10\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
"50:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -942,41 +942,41 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp2 z19.d, z19.d, z23.d\n"
"tbz %x[flags], #1, 51f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z6.s, p5/M, z6.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmax z6.s, p5/M, z6.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z6.s, p5/M, z6.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
"51:" // Height 4: No activation
"st1w { z6.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1048,54 +1048,54 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1127,15 +1127,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 60f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1146,125 +1146,125 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 60f\n"
"59:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"60:" // Height 5: input setup done
"cmp x27, #0x4\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
"sub x27, x27, #0x4\n"
"uzp1 z3.h, z3.h, z3.h\n"
- ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
"cmp x27, #0x4\n"
"add x26, x26, #0x10\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
"add x25, x25, #0x10\n"
- "uzp1 z4.h, z4.h, z4.h\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
"add x22, x22, #0x10\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
"ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
- "trn1 z0.d, z0.d, z1.d\n"
- "trn1 z2.d, z2.d, z3.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
"uzp1 z4.h, z4.h, z4.h\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
"63:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1297,49 +1297,49 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"uzp1 z27.d, z27.d, z31.d\n"
"tbz %x[flags], #1, 64f\n"
"add x20, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z1.s }, p5/Z, [x20]\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
"ld1rw { z0.s }, p5/Z, [x20]\n"
- "fmin z6.s, p5/M, z6.s, z1.s\n"
- "fmin z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z1.s\n"
- "fmin z14.s, p5/M, z14.s, z1.s\n"
- "fmin z8.s, p5/M, z8.s, z1.s\n"
- "fmin z9.s, p5/M, z9.s, z1.s\n"
- "fmin z10.s, p5/M, z10.s, z1.s\n"
- "fmin z11.s, p5/M, z11.s, z1.s\n"
- "fmin z15.s, p5/M, z15.s, z1.s\n"
- "fmin z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z1.s\n"
- "fmin z22.s, p5/M, z22.s, z1.s\n"
- "fmin z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z1.s\n"
- "fmin z18.s, p5/M, z18.s, z1.s\n"
- "fmin z19.s, p5/M, z19.s, z1.s\n"
- "fmin z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z1.s\n"
- "fmin z26.s, p5/M, z26.s, z1.s\n"
- "fmin z27.s, p5/M, z27.s, z1.s\n"
- "fmax z6.s, p5/M, z6.s, z0.s\n"
- "fmax z12.s, p5/M, z12.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z0.s\n"
- "fmax z14.s, p5/M, z14.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z0.s\n"
- "fmax z9.s, p5/M, z9.s, z0.s\n"
- "fmax z10.s, p5/M, z10.s, z0.s\n"
- "fmax z11.s, p5/M, z11.s, z0.s\n"
- "fmax z15.s, p5/M, z15.s, z0.s\n"
- "fmax z20.s, p5/M, z20.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z0.s\n"
- "fmax z22.s, p5/M, z22.s, z0.s\n"
- "fmax z16.s, p5/M, z16.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z0.s\n"
- "fmax z18.s, p5/M, z18.s, z0.s\n"
- "fmax z19.s, p5/M, z19.s, z0.s\n"
- "fmax z24.s, p5/M, z24.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z0.s\n"
- "fmax z26.s, p5/M, z26.s, z0.s\n"
- "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z6.s, p5/M, z6.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
"64:" // Height 5: No activation
"st1w { z6.s }, p4, [x9]\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1418,59 +1418,59 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x25, x9, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x25]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x24]\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x23]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
"zip2 z15.d, z16.d, z15.d\n"
"zip1 z16.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
"zip2 z21.d, z18.d, z21.d\n"
"zip1 z18.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
@@ -1502,16 +1502,16 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 73f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20, LSL #2\n"
@@ -1523,135 +1523,135 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"b 73f\n"
"72:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
- "add x21, x22, x20, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"73:" // Height 6: input setup done
"cmp x27, #0x4\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z5.s }, p0/Z, [x21]\n"
- ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
"sub x27, x27, #0x4\n"
"cmp x27, #0x4\n"
- "uzp1 z3.h, z3.h, z3.h\n"
"uzp1 z4.h, z4.h, z4.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "uzp1 z5.h, z5.h, z5.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
"add x24, x24, #0x10\n"
- "trn1 z2.d, z2.d, z3.d\n"
- "trn1 z4.d, z4.d, z5.d\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
"add x23, x23, #0x10\n"
- ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x22, x22, #0x10\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
"add x21, x21, #0x10\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.s, XZR, x27\n"
- "ld1rqw { z0.s }, p0/Z, [x26]\n"
- "ld1rqw { z1.s }, p0/Z, [x25]\n"
- ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
- "ld1rqw { z2.s }, p0/Z, [x24]\n"
- "ld1rqw { z3.s }, p0/Z, [x23]\n"
- ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
- ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "ld1rqw { z5.s }, p0/Z, [x21]\n"
- ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
- ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "uzp1 z1.h, z1.h, z1.h\n"
- "uzp1 z2.h, z2.h, z2.h\n"
- "uzp1 z3.h, z3.h, z3.h\n"
- "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
"uzp1 z5.h, z5.h, z5.h\n"
- "trn1 z0.d, z0.d, z1.d\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "trn1 z2.d, z2.d, z3.d\n"
- "trn1 z4.d, z4.d, z5.d\n"
- ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
- ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
- ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
- ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
- ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
- ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
- ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
- ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
- ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
- ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
"76:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1782,7 +1782,6 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1790,4 +1789,4 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c8a7d66f28..ffc1606b3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 562b2759aa..b7c523466e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -104,11 +104,11 @@ void sve_hybrid_s8qa_dot_4x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -121,39 +121,39 @@ void sve_hybrid_s8qa_dot_4x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[0]\n"
+ "sdot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "sdot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[1]\n"
+ "sdot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -164,47 +164,47 @@ void sve_hybrid_s8qa_dot_4x4VL (
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
"subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z22.b, z0.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z16.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z22.b, z0.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"addvl x28, x28, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
@@ -218,71 +218,71 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "neg z20.s, p2/M, z20.s\n"
+ "mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [x10]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
"addvl x10, x10, #4\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z0.d\n"
+ "and z21.d, z18.d, z0.d\n"
+ "and z20.d, z19.d, z0.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z4.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z20.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z21.s\n"
+ "smin z17.s, p2/M, z17.s, z21.s\n"
+ "smin z18.s, p2/M, z18.s, z21.s\n"
+ "smin z19.s, p2/M, z19.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z20.s\n"
+ "smax z17.s, p2/M, z17.s, z20.s\n"
+ "smax z18.s, p2/M, z18.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
@@ -317,12 +317,12 @@ void sve_hybrid_s8qa_dot_4x4VL (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -330,7 +330,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
"cmp x25, #0x10\n"
"ble 23f\n"
@@ -339,56 +339,56 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[0]\n"
+ "sdot z21.s, z26.b, z1.b[0]\n"
+ "sdot z18.s, z24.b, z0.b[0]\n"
+ "sdot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z19.s, z25.b, z0.b[0]\n"
+ "sdot z23.s, z25.b, z1.b[0]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[1]\n"
+ "sdot z20.s, z24.b, z1.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "sdot z17.s, z27.b, z0.b[1]\n"
+ "sdot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z18.s, z26.b, z0.b[1]\n"
+ "sdot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z19.s, z25.b, z0.b[1]\n"
+ "sdot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[2]\n"
+ "sdot z20.s, z24.b, z1.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
"add x23, x23, #0x10\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z17.s, z30.b, z0.b[2]\n"
+ "sdot z21.s, z30.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z0.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z0.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z16.s, z27.b, z0.b[3]\n"
+ "sdot z20.s, z27.b, z1.b[3]\n"
+ "sdot z17.s, z26.b, z0.b[3]\n"
+ "sdot z21.s, z26.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z0.b[3]\n"
+ "sdot z22.s, z25.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z0.b[3]\n"
+ "sdot z23.s, z24.b, z1.b[3]\n"
"tbnz %x[flags], #31, 22f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
@@ -401,63 +401,63 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[0]\n"
+ "sdot z21.s, z26.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z0.b[0]\n"
+ "sdot z22.s, z25.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z0.b[0]\n"
+ "sdot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z16.s, z27.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z27.b, z1.b[1]\n"
+ "sdot z17.s, z26.b, z0.b[1]\n"
+ "sdot z21.s, z26.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z0.b[1]\n"
+ "sdot z23.s, z24.b, z1.b[1]\n"
"ble 24f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z16.s, z27.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z27.b, z1.b[2]\n"
+ "sdot z17.s, z26.b, z0.b[2]\n"
+ "sdot z21.s, z26.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z0.b[2]\n"
+ "sdot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[3]\n"
+ "sdot z20.s, z24.b, z1.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[3]\n"
+ "sdot z21.s, z26.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z0.b[3]\n"
+ "sdot z22.s, z25.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z0.b[3]\n"
+ "sdot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -473,120 +473,120 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
- "neg z2.s, p2/M, z2.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
"addvl x10, x10, #4\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z27.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "add z22.s, z22.s, z26.s\n"
+ "add z23.s, z23.s, z25.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "and z30.d, z17.d, z0.d\n"
+ "and z29.d, z18.d, z0.d\n"
+ "and z28.d, z19.d, z0.d\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z0.d\n"
+ "and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z4.s\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"st1b { z20.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
@@ -624,13 +624,13 @@ void sve_hybrid_s8qa_dot_4x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -639,8 +639,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
"cmp x25, #0x10\n"
"ble 37f\n"
@@ -650,73 +650,73 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
+ "sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "sdot z18.s, z29.b, z0.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "sdot z26.s, z29.b, z2.b[0]\n"
+ "sdot z19.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "sdot z16.s, z3.b, z0.b[1]\n"
+ "sdot z20.s, z3.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
"add x23, x23, #0x10\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "sdot z24.s, z3.b, z2.b[1]\n"
+ "sdot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
"add x22, x22, #0x10\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z21.s, z31.b, z1.b[1]\n"
+ "sdot z25.s, z31.b, z2.b[1]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z18.s, z30.b, z0.b[1]\n"
+ "sdot z22.s, z30.b, z1.b[1]\n"
+ "sdot z26.s, z30.b, z2.b[1]\n"
+ "sdot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "sdot z23.s, z29.b, z1.b[1]\n"
+ "sdot z27.s, z29.b, z2.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z28.b, z0.b[2]\n"
+ "sdot z20.s, z28.b, z1.b[2]\n"
+ "sdot z24.s, z28.b, z2.b[2]\n"
+ "sdot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[2]\n"
+ "sdot z25.s, z5.b, z2.b[2]\n"
+ "sdot z18.s, z4.b, z0.b[2]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z26.s, z4.b, z2.b[2]\n"
+ "sdot z19.s, z3.b, z0.b[2]\n"
+ "sdot z23.s, z3.b, z1.b[2]\n"
+ "sdot z27.s, z3.b, z2.b[2]\n"
+ "sdot z16.s, z31.b, z0.b[3]\n"
+ "sdot z20.s, z31.b, z1.b[3]\n"
+ "sdot z24.s, z31.b, z2.b[3]\n"
+ "sdot z17.s, z30.b, z0.b[3]\n"
+ "sdot z21.s, z30.b, z1.b[3]\n"
+ "sdot z25.s, z30.b, z2.b[3]\n"
+ "sdot z18.s, z29.b, z0.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z2.b[3]\n"
+ "sdot z19.s, z28.b, z0.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z2.b[3]\n"
"tbnz %x[flags], #31, 36f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
@@ -731,79 +731,79 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
+ "sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
+ "sdot z18.s, z29.b, z0.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z2.b[0]\n"
+ "sdot z19.s, z28.b, z0.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z2.b[0]\n"
"ble 38f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z16.s, z31.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z31.b, z1.b[1]\n"
+ "sdot z24.s, z31.b, z2.b[1]\n"
+ "sdot z17.s, z30.b, z0.b[1]\n"
+ "sdot z21.s, z30.b, z1.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
+ "sdot z25.s, z30.b, z2.b[1]\n"
+ "sdot z18.s, z29.b, z0.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z2.b[1]\n"
+ "sdot z19.s, z28.b, z0.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z2.b[1]\n"
"ble 38f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z31.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z31.b, z1.b[2]\n"
+ "sdot z24.s, z31.b, z2.b[2]\n"
+ "sdot z17.s, z30.b, z0.b[2]\n"
+ "sdot z21.s, z30.b, z1.b[2]\n"
"addvl x28, x28, #4\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z25.s, z30.b, z2.b[2]\n"
+ "sdot z18.s, z29.b, z0.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z2.b[2]\n"
+ "sdot z19.s, z28.b, z0.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z2.b[2]\n"
"ble 38f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z31.b, z0.b[3]\n"
+ "sdot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z31.b, z2.b[3]\n"
+ "sdot z17.s, z30.b, z0.b[3]\n"
+ "sdot z21.s, z30.b, z1.b[3]\n"
+ "sdot z25.s, z30.b, z2.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z18.s, z29.b, z0.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z2.b[3]\n"
+ "sdot z19.s, z28.b, z0.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z2.b[3]\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -821,33 +821,33 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
"saddv d13, p0, z13.s\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
- "neg z3.s, p2/M, z3.s\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "neg z28.s, p2/M, z28.s\n"
+ "mul z11.s, p2/M, z11.s, z28.s\n"
+ "mul z12.s, p2/M, z12.s, z28.s\n"
+ "mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
@@ -855,133 +855,133 @@ void sve_hybrid_s8qa_dot_4x4VL (
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z31.s\n"
+ "add z18.s, z18.s, z30.s\n"
+ "add z19.s, z19.s, z29.s\n"
"add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z22.s, z22.s, z30.s\n"
+ "add z23.s, z23.s, z29.s\n"
"add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
+ "add z25.s, z25.s, z31.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z26.s, z26.s, z30.s\n"
+ "add z27.s, z27.s, z29.s\n"
+ ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n"
+ ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n"
+ ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n"
+ ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n"
+ ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n"
+ ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n"
+ ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n"
+ ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n"
+ ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n"
+ ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n"
+ ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n"
+ ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z16.d, z0.d\n"
+ "and z31.d, z17.d, z0.d\n"
+ "and z30.d, z18.d, z0.d\n"
+ "and z29.d, z19.d, z0.d\n"
+ "and z28.d, z20.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z1.s\n"
+ "sqadd z17.s, z17.s, z31.s\n"
+ "sqadd z18.s, z18.s, z30.s\n"
+ "sqadd z19.s, z19.s, z29.s\n"
+ "sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z0.d\n"
+ "and z29.d, z26.d, z0.d\n"
+ "and z28.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z3.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z4.s\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z29.s\n"
+ "smin z17.s, p2/M, z17.s, z29.s\n"
+ "smin z18.s, p2/M, z18.s, z29.s\n"
+ "smin z19.s, p2/M, z19.s, z29.s\n"
+ "smin z20.s, p2/M, z20.s, z29.s\n"
+ "smin z21.s, p2/M, z21.s, z29.s\n"
+ "smin z22.s, p2/M, z22.s, z29.s\n"
+ "smin z23.s, p2/M, z23.s, z29.s\n"
+ "smin z24.s, p2/M, z24.s, z29.s\n"
+ "smin z25.s, p2/M, z25.s, z29.s\n"
+ "smin z26.s, p2/M, z26.s, z29.s\n"
+ "smin z27.s, p2/M, z27.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x23]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
@@ -1027,14 +1027,14 @@ void sve_hybrid_s8qa_dot_4x4VL (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1044,9 +1044,9 @@ void sve_hybrid_s8qa_dot_4x4VL (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
"cmp x25, #0x10\n"
"ble 51f\n"
@@ -1059,88 +1059,88 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z3.b }, p0/Z, [x21]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "sdot z28.s, z4.b, z3.b[0]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[0]\n"
+ "sdot z20.s, z5.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z5.b, z2.b[0]\n"
+ "sdot z28.s, z5.b, z3.b[0]\n"
+ "sdot z17.s, z4.b, z0.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[0]\n"
"ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "sdot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z25.s, z4.b, z2.b[0]\n"
+ "sdot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
"add x21, x21, #0x10\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "sdot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "sdot z31.s, z9.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
"sdot z20.s, z8.b, z1.b[1]\n"
"sdot z24.s, z8.b, z2.b[1]\n"
"sdot z28.s, z8.b, z3.b[1]\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
- "sdot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "sdot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z29.s, z6.b, z3.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z30.s, z7.b, z3.b[2]\n"
+ "sdot z17.s, z7.b, z0.b[1]\n"
+ "sdot z21.s, z7.b, z1.b[1]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ "sdot z29.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[1]\n"
+ "sdot z22.s, z6.b, z1.b[1]\n"
+ "sdot z26.s, z6.b, z2.b[1]\n"
+ "sdot z30.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "sdot z19.s, z5.b, z0.b[1]\n"
+ "sdot z23.s, z5.b, z1.b[1]\n"
+ "sdot z27.s, z5.b, z2.b[1]\n"
+ "sdot z31.s, z5.b, z3.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z4.b, z0.b[2]\n"
+ "sdot z20.s, z4.b, z1.b[2]\n"
+ "sdot z24.s, z4.b, z2.b[2]\n"
+ "sdot z28.s, z4.b, z3.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z17.s, z10.b, z0.b[2]\n"
+ "sdot z21.s, z10.b, z1.b[2]\n"
+ "sdot z25.s, z10.b, z2.b[2]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z30.s, z9.b, z3.b[2]\n"
"sdot z19.s, z8.b, z0.b[2]\n"
"sdot z23.s, z8.b, z1.b[2]\n"
"sdot z27.s, z8.b, z2.b[2]\n"
"sdot z31.s, z8.b, z3.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z28.s, z9.b, z3.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z29.s, z10.b, z3.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z30.s, z4.b, z3.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
- "sdot z31.s, z5.b, z3.b[3]\n"
+ "sdot z16.s, z7.b, z0.b[3]\n"
+ "sdot z20.s, z7.b, z1.b[3]\n"
+ "sdot z24.s, z7.b, z2.b[3]\n"
+ "sdot z28.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z6.b, z0.b[3]\n"
+ "sdot z21.s, z6.b, z1.b[3]\n"
+ "sdot z25.s, z6.b, z2.b[3]\n"
+ "sdot z29.s, z6.b, z3.b[3]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z22.s, z5.b, z1.b[3]\n"
+ "sdot z26.s, z5.b, z2.b[3]\n"
+ "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z19.s, z4.b, z0.b[3]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "sdot z27.s, z4.b, z2.b[3]\n"
+ "sdot z31.s, z4.b, z3.b[3]\n"
"tbnz %x[flags], #31, 50f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
@@ -1157,95 +1157,95 @@ void sve_hybrid_s8qa_dot_4x4VL (
"subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "sdot z28.s, z4.b, z3.b[0]\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z7.b, z0.b[0]\n"
+ "sdot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z28.s, z7.b, z3.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[0]\n"
+ "sdot z21.s, z6.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "sdot z29.s, z5.b, z3.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z30.s, z6.b, z3.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "sdot z31.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z2.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[0]\n"
+ "sdot z18.s, z5.b, z0.b[0]\n"
+ "sdot z22.s, z5.b, z1.b[0]\n"
+ "sdot z26.s, z5.b, z2.b[0]\n"
+ "sdot z30.s, z5.b, z3.b[0]\n"
+ "sdot z19.s, z4.b, z0.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[0]\n"
+ "sdot z27.s, z4.b, z2.b[0]\n"
+ "sdot z31.s, z4.b, z3.b[0]\n"
"ble 52f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z16.s, z7.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
- "sdot z28.s, z8.b, z3.b[1]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z20.s, z7.b, z1.b[1]\n"
+ "sdot z24.s, z7.b, z2.b[1]\n"
+ "sdot z28.s, z7.b, z3.b[1]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z29.s, z9.b, z3.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z30.s, z10.b, z3.b[1]\n"
+ "sdot z21.s, z6.b, z1.b[1]\n"
+ "sdot z25.s, z6.b, z2.b[1]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z3.b[1]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sdot z27.s, z4.b, z2.b[1]\n"
"sdot z31.s, z4.b, z3.b[1]\n"
"ble 52f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "sdot z28.s, z5.b, z3.b[2]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "sdot z28.s, z7.b, z3.b[2]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z25.s, z6.b, z2.b[2]\n"
"sdot z29.s, z6.b, z3.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z30.s, z7.b, z3.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
- "sdot z31.s, z8.b, z3.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[2]\n"
+ "sdot z22.s, z5.b, z1.b[2]\n"
+ "sdot z26.s, z5.b, z2.b[2]\n"
+ "sdot z30.s, z5.b, z3.b[2]\n"
+ "sdot z19.s, z4.b, z0.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[2]\n"
+ "sdot z27.s, z4.b, z2.b[2]\n"
+ "sdot z31.s, z4.b, z3.b[2]\n"
"ble 52f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z28.s, z9.b, z3.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z7.b, z0.b[3]\n"
+ "sdot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[3]\n"
+ "sdot z28.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z6.b, z0.b[3]\n"
+ "sdot z21.s, z6.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z29.s, z10.b, z3.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z30.s, z4.b, z3.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
- "sdot z31.s, z5.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z2.b[3]\n"
+ "sdot z29.s, z6.b, z3.b[3]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z22.s, z5.b, z1.b[3]\n"
+ "sdot z26.s, z5.b, z2.b[3]\n"
+ "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z19.s, z4.b, z0.b[3]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "sdot z27.s, z4.b, z2.b[3]\n"
+ "sdot z31.s, z4.b, z3.b[3]\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -1265,7 +1265,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
@@ -1273,28 +1273,28 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
"saddv d14, p0, z14.s\n"
- "neg z4.s, p2/M, z4.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
@@ -1305,174 +1305,174 @@ void sve_hybrid_s8qa_dot_4x4VL (
"add z29.s, z29.s, z14.s\n"
"add z30.s, z30.s, z14.s\n"
"add z31.s, z31.s, z14.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z29.s, z29.s, z1.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z0.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z0.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z2.d, z16.d, z0.d\n"
+ "and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "and z7.d, z18.d, z0.d\n"
+ "and z6.d, z19.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z4.d, z21.d, z0.d\n"
+ "and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
- "and z10.d, z29.d, z0.d\n"
- "and z4.d, z30.d, z0.d\n"
- "and z5.d, z31.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z4.s\n"
+ "sqadd z22.s, z22.s, z3.s\n"
+ "sqadd z23.s, z23.s, z2.s\n"
+ "sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
+ "and z6.d, z26.d, z0.d\n"
+ "and z5.d, z27.d, z0.d\n"
+ "and z4.d, z28.d, z0.d\n"
+ "and z3.d, z29.d, z0.d\n"
+ "and z2.d, z30.d, z0.d\n"
+ "and z1.d, z31.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
- "sqadd z28.s, z28.s, z9.s\n"
- "sqadd z29.s, z29.s, z10.s\n"
- "sqadd z30.s, z30.s, z4.s\n"
- "sqadd z31.s, z31.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "sqadd z29.s, z29.s, z3.s\n"
+ "sqadd z30.s, z30.s, z2.s\n"
+ "sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z27.s, z27.s, z4.s\n"
- "add z28.s, z28.s, z4.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z29.s, z29.s, z4.s\n"
- "add z30.s, z30.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z31.s, z31.s, z4.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x23]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
"st1b { z24.b }, p1, [x22]\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
"st1b { z28.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
@@ -1491,7 +1491,6 @@ void sve_hybrid_s8qa_dot_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1499,4 +1498,4 @@ void sve_hybrid_s8qa_dot_4x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
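
Two mechanical patterns account for almost all of the generic.cpp hunks above. First, the vector-register renumbering (for example z8/z9/z10 becoming z7/z6/z5 in the dot loops, or the requantisation constant moving from z4 to z0/z1) is a pure renaming: every definition and every use of a value move together, so the sdot/sqrdmulh/srshl arithmetic is unchanged. Second, the saturating multiplies are emitted as raw ".inst" words rather than sqrdmulh mnemonics (presumably so the file assembles on toolchains that do not accept the mnemonic), which means the hex encodings must be re-derived by hand whenever a register is renamed: bits [20:16] of the word hold the second source register Zm, and that is the only field that differs between the old and new encodings. Two lines from the hunks above illustrate the relationship:

    .inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s   -> Zm field = 0b00100 (z4)
    .inst 0x04a17610  // sqrdmulh z16.s, z16.s, z1.s   -> Zm field = 0b00001 (z1)

The same renaming-plus-re-encoding pattern repeats in the mmla kernel further below. Separately, the final hunk above corrects the trailing #endif comment to name ARM_COMPUTE_ENABLE_SVE, the macro that actually gates the file; the header diff that follows shows the full guard layout.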
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
index 9681505e8c..ae922e9743 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
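
For the header, beyond the copyright year and the license-text reflow, the changes are whitespace: the guard now sits flush against "#pragma once", with the blank line moved inside the guarded region. A minimal sketch of the guard layout both of these files follow, with everything except the guard lines reduced to a placeholder:

    #pragma once
    #ifdef ARM_COMPUTE_ENABLE_SVE

    // ... includes, kernel declaration or inline-assembly definition ...

    #endif // ARM_COMPUTE_ENABLE_SVE

Gating on the library's build define rather than the compiler's __ARM_FEATURE_SVE feature macro lets the build system decide explicitly which translation units carry SVE code, instead of inferring it from whatever -march the compiler was given.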
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
index 626a06b26b..e0628364f4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -108,11 +108,11 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -125,41 +125,41 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
+ ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
+ ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
"add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -171,43 +171,43 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
+ ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
+ ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
+ ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
"addvl x28, x28, #8\n"
"ble 10f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
"addvl x28, x28, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
@@ -224,74 +224,74 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"uzp1 z19.d, z19.d, z23.d\n"
"mov z23.d, z16.d\n"
"tbnz %x[flags], #31, 12f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "neg z1.s, p2/M, z1.s\n"
+ "neg z16.s, p2/M, z16.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
"12:" // Height 1: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x10]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add z23.s, z23.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ "add z23.s, z23.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
"addvl x10, x10, #4\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z22.d, z23.d, z0.d\n"
+ "and z21.d, z17.d, z0.d\n"
+ "and z20.d, z18.d, z0.d\n"
+ "and z16.d, z19.d, z0.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z22.s\n"
+ "sqadd z17.s, z17.s, z21.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
"13:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z16.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "smax z17.s, p2/M, z17.s, z16.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
"uzp1 z23.h, z23.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z17.b\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
+ "uzp1 z16.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
"st1b { z23.b }, p1, [x27]\n"
"addvl x27, x27, #1\n"
"14:" // Height 1: Writeback done
@@ -324,12 +324,12 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -337,49 +337,49 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
"cmp x25, #0x10\n"
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
+ ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
+ ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"tbnz %x[flags], #31, 22f\n"
@@ -392,44 +392,44 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
+ ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
+ ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
+ ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
"addvl x28, x28, #8\n"
"ble 24f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
"addvl x28, x28, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
@@ -440,133 +440,133 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 18b\n"
- "uzp1 z7.d, z16.d, z20.d\n"
+ "uzp1 z24.d, z16.d, z20.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
+ "add x23, x27, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "mov z23.d, z7.d\n"
+ "mov z23.d, z24.d\n"
"tbnz %x[flags], #31, 26f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "neg z2.s, p2/M, z2.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add z23.s, z23.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z20.s, z20.s, z27.s\n"
"addvl x10, x10, #4\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z5.d, z20.d, z0.d\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z0.d\n"
+ "and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
"27:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z24.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
"uzp1 z23.h, z23.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z23.b, z23.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z23.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -607,13 +607,13 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -622,8 +622,8 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
"cmp x25, #0x10\n"
"ble 37f\n"
@@ -634,60 +634,60 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n"
+ ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"add x23, x23, #0x10\n"
".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
"add x22, x22, #0x10\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
+ ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
+ ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
+ ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
- ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
"tbnz %x[flags], #31, 36f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
@@ -708,56 +708,56 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"trn1 z2.d, z3.d, z4.d\n"
"trn2 z3.d, z3.d, z4.d\n"
".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n"
+ ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n"
".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
- ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
+ ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
+ ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
+ ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n"
+ ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n"
+ ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n"
+ ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
"ble 38f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
- ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -770,12 +770,12 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"cmp x26, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "add x21, x22, x20\n"
+ "add x22, x23, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
@@ -784,170 +784,170 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
+ "mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 40f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "neg z3.s, p2/M, z3.s\n"
+ "neg z23.s, p2/M, z23.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z11.s, p2/M, z11.s, z23.s\n"
"mov z13.s, z13.s[0]\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z23.s\n"
+ "mul z13.s, p2/M, z13.s, z23.s\n"
"40:" // Height 3: skip row sum fixup
"add z31.s, z31.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
"ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
"addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z31.s, z31.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z28.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
"add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n"
+ ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
+ ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n"
+ ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n"
+ ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n"
+ ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z20.d, z0.d\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z31.d, z0.d\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z23.d, z16.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z1.s\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
+ "and z1.d, z19.d, z0.d\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z0.d\n"
+ "and z28.d, z26.d, z0.d\n"
+ "and z23.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z3.s\n"
+ "sqadd z18.s, z18.s, z2.s\n"
+ "sqadd z19.s, z19.s, z1.s\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
"41:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z31.s, z31.s, z23.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z31.s, p2/M, z31.s, z23.s\n"
+ "smax z20.s, p2/M, z20.s, z23.s\n"
+ "smax z21.s, p2/M, z21.s, z23.s\n"
"uzp1 z31.h, z31.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z23.s\n"
+ "smax z16.s, p2/M, z16.s, z23.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z31.b, z31.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z23.s\n"
+ "smax z18.s, p2/M, z18.s, z23.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z31.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z23.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z23.s\n"
+ "smax z26.s, p2/M, z26.s, z23.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -992,14 +992,14 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1009,9 +1009,9 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
"cmp x25, #0x10\n"
"ble 51f\n"
@@ -1021,63 +1021,63 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
- ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059811 // smmla z17.s, z0.b, z5.b\n"
+ ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45049815 // smmla z21.s, z0.b, z4.b\n"
+ ".inst 0x4504985d // smmla z29.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
- ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"add x24, x24, #0x10\n"
".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
"add x22, x22, #0x10\n"
- ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
+ ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
"add x21, x21, #0x10\n"
- ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
- ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
"tbnz %x[flags], #31, 50f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
@@ -1093,62 +1093,62 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"subs x25, x25, #0x8\n"
- ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
- ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
- ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
- ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
- ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
- ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
- ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
- ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
- ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
+ ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
+ ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n"
+ ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n"
+ ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n"
+ ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
"ble 52f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
- ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
- ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
- ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
- ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
- ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
- ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -1161,12 +1161,12 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"cmp x26, x20\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
- "add x20, x21, x20\n"
+ "add x21, x22, x20\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
@@ -1180,38 +1180,38 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"uzp2 z26.d, z26.d, z30.d\n"
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
+ "mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 54f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "neg z4.s, p2/M, z4.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
"mov z14.s, z13.s[3]\n"
"mov z13.s, z13.s[0]\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z31.s, z31.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z23.s, z23.s, z13.s\n"
"add z28.s, z28.s, z13.s\n"
"addvl x10, x10, #4\n"
@@ -1221,175 +1221,175 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"add z25.s, z25.s, z14.s\n"
"add z26.s, z26.s, z14.s\n"
"add z27.s, z27.s, z14.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z23.s, z23.s, z0.s\n"
- "add z28.s, z28.s, z1.s\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z3.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z3.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "and z5.d, z23.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z2.d, z31.d, z0.d\n"
+ "and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
+ "and z7.d, z21.d, z0.d\n"
+ "and z6.d, z22.d, z0.d\n"
+ "and z5.d, z16.d, z0.d\n"
+ "and z4.d, z17.d, z0.d\n"
+ "and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
- "sqadd z23.s, z23.s, z5.s\n"
- "and z6.d, z28.d, z0.d\n"
- "and z7.d, z29.d, z0.d\n"
- "and z8.d, z30.d, z0.d\n"
- "and z9.d, z24.d, z0.d\n"
- "and z10.d, z25.d, z0.d\n"
- "and z4.d, z26.d, z0.d\n"
- "and z5.d, z27.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z16.s, z16.s, z5.s\n"
+ "sqadd z17.s, z17.s, z4.s\n"
+ "sqadd z18.s, z18.s, z3.s\n"
+ "sqadd z19.s, z19.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z0.d\n"
+ "and z5.d, z30.d, z0.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "and z3.d, z25.d, z0.d\n"
+ "and z2.d, z26.d, z0.d\n"
+ "and z1.d, z27.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z28.s, z28.s, z6.s\n"
- "sqadd z29.s, z29.s, z7.s\n"
- "sqadd z30.s, z30.s, z8.s\n"
- "sqadd z24.s, z24.s, z9.s\n"
- "sqadd z25.s, z25.s, z10.s\n"
- "sqadd z26.s, z26.s, z4.s\n"
- "sqadd z27.s, z27.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z3.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "sqadd z27.s, z27.s, z1.s\n"
"55:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z31.s, z31.s, z2.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "add z28.s, z28.s, z4.s\n"
- "add z29.s, z29.s, z4.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z30.s, z30.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
"uzp1 z31.h, z31.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z31.b, z31.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z31.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
"uzp1 z23.h, z23.h, z28.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z28.h, z29.h, z30.h\n"
- "uzp1 z23.b, z23.b, z28.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z23.b }, p1, [x21]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x20]\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -1407,7 +1407,6 @@ void sve_hybrid_s8qa_mmla_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1415,4 +1414,4 @@ void sve_hybrid_s8qa_mmla_4x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
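
The hunk above, like the `#ifdef ARM_COMPUTE_ENABLE_SVE` guard later in this patch, replaces the compiler-defined feature test `__ARM_FEATURE_SVE` with the library's own build-system flag `ARM_COMPUTE_ENABLE_SVE`, so kernel inclusion is controlled by the build configuration rather than by whatever the host compiler happens to define. A minimal sketch of the guard shape these hunks converge on; the namespace matches the patch, but the function name is a hypothetical stand-in, not code from this commit:

#ifdef ARM_COMPUTE_ENABLE_SVE

namespace arm_gemm {

// Hypothetical stand-in for one of the generated SVE kernels guarded
// this way: the body is only compiled when the build opts in.
void example_sve_kernel()
{
    // ... generated SVE inline assembly would live here ...
}

} // namespace arm_gemm

#endif // ARM_COMPUTE_ENABLE_SVE
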
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index dad04c81e8..056ae7a616 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
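
The generic.cpp hunks in this patch are dominated by paired rewrites of a comment mnemonic and its raw `.inst` word (for example `0x45049810 // smmla z16.s, z0.b, z4.b`): the three Z-register numbers sit in fixed 5-bit fields of the 32-bit encoding, so renumbering a register shows up as a small, predictable change in the hex word. A short self-contained C++ sketch extracting those fields for the SMMLA words seen in the diff, assuming the standard A64 SVE operand layout (destination in bits [4:0], first source in [9:5], second source in [20:16]):

#include <cstdint>
#include <cstdio>

// Decode the Z-register operands of an SVE SMMLA instruction word.
// A rename such as z6 -> z4 in the diff changes only the affected
// 5-bit field, which is why each ".inst" rewrite tracks its comment.
static void decode_smmla(uint32_t inst)
{
    uint32_t zda = inst & 0x1f;          // accumulator, bits [4:0]
    uint32_t zn  = (inst >> 5) & 0x1f;   // first source, bits [9:5]
    uint32_t zm  = (inst >> 16) & 0x1f;  // second source, bits [20:16]
    std::printf("0x%08x -> smmla z%u.s, z%u.b, z%u.b\n", inst, zda, zn, zm);
}

int main()
{
    decode_smmla(0x45049810); // smmla z16.s, z0.b, z4.b (from the diff)
    decode_smmla(0x4504987f); // smmla z31.s, z3.b, z4.b (from the diff)
    return 0;
}

Running this reproduces the mnemonics in the diff comments, which is a quick way to sanity-check that a renumbered `.inst` word and its updated comment still agree.
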
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 1e71806838..c28717a37e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -113,11 +113,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -130,101 +130,101 @@ void sve_hybrid_s8qs_dot_6x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
"add x26, x26, #0x10\n"
"bgt 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
"addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
"addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
"addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
"addvl x9, x9, #4\n"
"9:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 4b\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "add z8.s, z8.s, z17.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
"addvl x14, x14, #4\n"
"tbz %x[flags], #4, 10f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
@@ -239,10 +239,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 11f\n"
"10:" // Height 1: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -255,44 +255,44 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 12f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
+ "and z19.d, z8.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
"12:" // Height 1: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z8.s, z8.s, z16.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z11.s, z11.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z17.s\n"
+ "smin z9.s, p2/M, z9.s, z17.s\n"
+ "smin z10.s, p2/M, z10.s, z17.s\n"
+ "smin z11.s, p2/M, z11.s, z17.s\n"
+ "smax z8.s, p2/M, z8.s, z16.s\n"
+ "smax z9.s, p2/M, z9.s, z16.s\n"
+ "smax z10.s, p2/M, z10.s, z16.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
+ "smax z11.s, p2/M, z11.s, z16.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
"st1b { z8.b }, p1, [x11]\n"
"addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
@@ -323,12 +323,12 @@ void sve_hybrid_s8qs_dot_6x4VL (
"17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -336,150 +336,150 @@ void sve_hybrid_s8qs_dot_6x4VL (
"b 19f\n"
"18:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"19:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 21f\n"
"20:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[0]\n"
+ "sdot z12.s, z17.b, z0.b[0]\n"
+ "sdot z9.s, z16.b, z1.b[0]\n"
+ "sdot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[0]\n"
+ "sdot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
"cmp x27, #0x10\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[0]\n"
+ "sdot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[1]\n"
+ "sdot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[1]\n"
+ "sdot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z10.s, z17.b, z1.b[1]\n"
+ "sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[1]\n"
+ "sdot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[2]\n"
+ "sdot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[2]\n"
+ "sdot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[2]\n"
+ "sdot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[2]\n"
+ "sdot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[3]\n"
+ "sdot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[3]\n"
+ "sdot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[3]\n"
+ "sdot z14.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z1.b[3]\n"
+ "sdot z15.s, z16.b, z0.b[3]\n"
"bgt 20b\n"
"21:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[0]\n"
+ "sdot z12.s, z17.b, z1.b[0]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "sdot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z14.s, z17.b, z1.b[0]\n"
"addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "sdot z15.s, z16.b, z1.b[0]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z12.s, z17.b, z1.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "sdot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z14.s, z17.b, z1.b[1]\n"
"addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "sdot z15.s, z16.b, z1.b[1]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z12.s, z17.b, z1.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "sdot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z14.s, z17.b, z1.b[2]\n"
"addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "sdot z15.s, z16.b, z1.b[2]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z12.s, z17.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "sdot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z14.s, z17.b, z1.b[3]\n"
"addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "sdot z15.s, z16.b, z1.b[3]\n"
"22:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 17b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "add x24, x11, x20\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add z11.s, z11.s, z3.s\n"
- "add z12.s, z12.s, z0.s\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add z8.s, z8.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add z12.s, z12.s, z19.s\n"
"addvl x14, x14, #4\n"
- "add z13.s, z13.s, z1.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z13.s, z13.s, z18.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
"tbz %x[flags], #4, 23f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -493,10 +493,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 24f\n"
"23:" // Height 2: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -513,77 +513,77 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
"tbz %x[flags], #5, 25f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z12.d, z0.d\n"
- "and z5.d, z13.d, z1.d\n"
- "and z6.d, z14.d, z2.d\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
+ "and z19.d, z8.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ "and z19.d, z12.d, z0.d\n"
+ "and z18.d, z13.d, z1.d\n"
+ "and z17.d, z14.d, z2.d\n"
+ "and z16.d, z15.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z19.s\n"
+ "sqadd z13.s, z13.s, z18.s\n"
+ "sqadd z14.s, z14.s, z17.s\n"
+ "sqadd z15.s, z15.s, z16.s\n"
"25:" // Height 2: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z8.s, z8.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add z12.s, z12.s, z17.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z13.s, z13.s, z4.s\n"
- "add z14.s, z14.s, z4.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z14.s, z14.s, z17.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z15.s, z15.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z15.s, z15.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"st1b { z8.b }, p1, [x11]\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x24]\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x26]\n"
"addvl x11, x11, #1\n"
"26:" // Height 2: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -617,13 +617,13 @@ void sve_hybrid_s8qs_dot_6x4VL (
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -632,86 +632,86 @@ void sve_hybrid_s8qs_dot_6x4VL (
"b 32f\n"
"31:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"32:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z21.b, z2.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[0]\n"
+ "sdot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
"cmp x27, #0x10\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z10.s, z21.b, z2.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z0.b[1]\n"
+ "sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z10.s, z21.b, z2.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z0.b[2]\n"
+ "sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z11.s, z20.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -719,104 +719,104 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z21.b, z0.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z2.b[0]\n"
+ "sdot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z10.s, z21.b, z0.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "sdot z18.s, z21.b, z2.b[0]\n"
+ "sdot z11.s, z20.b, z0.b[0]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z2.b[0]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z2.b[1]\n"
+ "sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z10.s, z21.b, z0.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z2.b[1]\n"
+ "sdot z11.s, z20.b, z0.b[1]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z2.b[1]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z2.b[2]\n"
+ "sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z10.s, z21.b, z0.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z2.b[2]\n"
+ "sdot z11.s, z20.b, z0.b[2]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z2.b[2]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z2.b[3]\n"
+ "sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z10.s, z21.b, z0.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z2.b[3]\n"
+ "sdot z11.s, z20.b, z0.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z2.b[3]\n"
"35:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 30b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "ld1w { z23.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add z8.s, z8.s, z23.s\n"
+ "add z9.s, z9.s, z22.s\n"
+ "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z10.s, z10.s, z21.s\n"
+ "add z11.s, z11.s, z20.s\n"
"addvl x14, x14, #4\n"
- "add z12.s, z12.s, z0.s\n"
- "add z13.s, z13.s, z1.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z12.s, z12.s, z23.s\n"
+ "add z13.s, z13.s, z22.s\n"
+ "add z14.s, z14.s, z21.s\n"
+ "add z15.s, z15.s, z20.s\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
"tbz %x[flags], #4, 36f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -830,10 +830,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 37f\n"
"36:" // Height 3: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -854,109 +854,109 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
"tbz %x[flags], #5, 38f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z12.d, z0.d\n"
- "and z5.d, z13.d, z1.d\n"
- "and z6.d, z14.d, z2.d\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z8.d, z0.d\n"
+ "and z22.d, z9.d, z1.d\n"
+ "and z21.d, z10.d, z2.d\n"
+ "and z20.d, z11.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z23.s\n"
+ "sqadd z9.s, z9.s, z22.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "sqadd z11.s, z11.s, z20.s\n"
+ "and z23.d, z12.d, z0.d\n"
+ "and z22.d, z13.d, z1.d\n"
+ "and z21.d, z14.d, z2.d\n"
+ "and z20.d, z15.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z23.s\n"
+ "sqadd z13.s, z13.s, z22.s\n"
+ "sqadd z14.s, z14.s, z21.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
+ "and z21.d, z18.d, z2.d\n"
+ "and z20.d, z19.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"38:" // Height 3: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z8.s, z8.s, z21.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z12.s, z12.s, z21.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z13.s, z13.s, z4.s\n"
- "add z14.s, z14.s, z4.s\n"
+ "add z13.s, z13.s, z21.s\n"
+ "add z14.s, z14.s, z21.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z15.s, z15.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z15.s, p2/M, z15.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"st1b { z8.b }, p1, [x11]\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
+ "uzp1 z20.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z12.b }, p1, [x24]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
+ "st1b { z16.b }, p1, [x25]\n"
"addvl x11, x11, #1\n"
"39:" // Height 3: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -994,14 +994,14 @@ void sve_hybrid_s8qs_dot_6x4VL (
"43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 44f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 45f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1011,105 +1011,105 @@ void sve_hybrid_s8qs_dot_6x4VL (
"b 45f\n"
"44:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"45:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 47f\n"
"46:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[0]\n"
+ "sdot z12.s, z25.b, z2.b[0]\n"
+ "sdot z16.s, z25.b, z1.b[0]\n"
+ "sdot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "sdot z17.s, z24.b, z1.b[0]\n"
+ "sdot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[0]\n"
+ "sdot z14.s, z25.b, z2.b[0]\n"
+ "sdot z18.s, z25.b, z1.b[0]\n"
+ "sdot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[0]\n"
+ "sdot z15.s, z24.b, z2.b[0]\n"
+ "sdot z19.s, z24.b, z1.b[0]\n"
+ "sdot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[1]\n"
+ "sdot z12.s, z25.b, z2.b[1]\n"
+ "sdot z16.s, z25.b, z1.b[1]\n"
+ "sdot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[1]\n"
+ "sdot z13.s, z24.b, z2.b[1]\n"
+ "sdot z17.s, z24.b, z1.b[1]\n"
+ "sdot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z10.s, z25.b, z3.b[1]\n"
+ "sdot z14.s, z25.b, z2.b[1]\n"
+ "sdot z18.s, z25.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[1]\n"
+ "sdot z15.s, z24.b, z2.b[1]\n"
+ "sdot z19.s, z24.b, z1.b[1]\n"
+ "sdot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[2]\n"
+ "sdot z12.s, z25.b, z2.b[2]\n"
+ "sdot z16.s, z25.b, z1.b[2]\n"
+ "sdot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[2]\n"
+ "sdot z13.s, z24.b, z2.b[2]\n"
+ "sdot z17.s, z24.b, z1.b[2]\n"
+ "sdot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[2]\n"
+ "sdot z14.s, z25.b, z2.b[2]\n"
+ "sdot z18.s, z25.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[2]\n"
+ "sdot z15.s, z24.b, z2.b[2]\n"
+ "sdot z19.s, z24.b, z1.b[2]\n"
+ "sdot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[3]\n"
+ "sdot z12.s, z25.b, z2.b[3]\n"
+ "sdot z16.s, z25.b, z1.b[3]\n"
+ "sdot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[3]\n"
+ "sdot z13.s, z24.b, z2.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[3]\n"
+ "sdot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[3]\n"
+ "sdot z14.s, z25.b, z2.b[3]\n"
+ "sdot z18.s, z25.b, z1.b[3]\n"
+ "sdot z22.s, z25.b, z0.b[3]\n"
+ "sdot z11.s, z24.b, z3.b[3]\n"
+ "sdot z15.s, z24.b, z2.b[3]\n"
+ "sdot z19.s, z24.b, z1.b[3]\n"
+ "sdot z23.s, z24.b, z0.b[3]\n"
"bgt 46b\n"
"47:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1118,125 +1118,125 @@ void sve_hybrid_s8qs_dot_6x4VL (
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[0]\n"
+ "sdot z12.s, z25.b, z1.b[0]\n"
+ "sdot z16.s, z25.b, z2.b[0]\n"
+ "sdot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
+ "sdot z17.s, z24.b, z2.b[0]\n"
+ "sdot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z10.s, z25.b, z0.b[0]\n"
+ "sdot z14.s, z25.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z2.b[0]\n"
+ "sdot z22.s, z25.b, z3.b[0]\n"
+ "sdot z11.s, z24.b, z0.b[0]\n"
+ "sdot z15.s, z24.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z2.b[0]\n"
+ "sdot z23.s, z24.b, z3.b[0]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[1]\n"
+ "sdot z12.s, z25.b, z1.b[1]\n"
+ "sdot z16.s, z25.b, z2.b[1]\n"
+ "sdot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[1]\n"
+ "sdot z13.s, z24.b, z1.b[1]\n"
+ "sdot z17.s, z24.b, z2.b[1]\n"
+ "sdot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z10.s, z25.b, z0.b[1]\n"
+ "sdot z14.s, z25.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z2.b[1]\n"
+ "sdot z22.s, z25.b, z3.b[1]\n"
+ "sdot z11.s, z24.b, z0.b[1]\n"
+ "sdot z15.s, z24.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z2.b[1]\n"
+ "sdot z23.s, z24.b, z3.b[1]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[2]\n"
+ "sdot z12.s, z25.b, z1.b[2]\n"
+ "sdot z16.s, z25.b, z2.b[2]\n"
+ "sdot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[2]\n"
+ "sdot z13.s, z24.b, z1.b[2]\n"
+ "sdot z17.s, z24.b, z2.b[2]\n"
+ "sdot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z10.s, z25.b, z0.b[2]\n"
+ "sdot z14.s, z25.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z2.b[2]\n"
+ "sdot z22.s, z25.b, z3.b[2]\n"
+ "sdot z11.s, z24.b, z0.b[2]\n"
+ "sdot z15.s, z24.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z2.b[2]\n"
+ "sdot z23.s, z24.b, z3.b[2]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[3]\n"
+ "sdot z12.s, z25.b, z1.b[3]\n"
+ "sdot z16.s, z25.b, z2.b[3]\n"
+ "sdot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[3]\n"
+ "sdot z13.s, z24.b, z1.b[3]\n"
+ "sdot z17.s, z24.b, z2.b[3]\n"
+ "sdot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z10.s, z25.b, z0.b[3]\n"
+ "sdot z14.s, z25.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z2.b[3]\n"
+ "sdot z22.s, z25.b, z3.b[3]\n"
+ "sdot z11.s, z24.b, z0.b[3]\n"
+ "sdot z15.s, z24.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z2.b[3]\n"
+ "sdot z23.s, z24.b, z3.b[3]\n"
"48:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 43b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "add x24, x11, x20\n"
- "add x23, x24, x20\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "add x22, x23, x20\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
+ "add z8.s, z8.s, z27.s\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
"addvl x14, x14, #4\n"
- "add z11.s, z11.s, z3.s\n"
- "add z12.s, z12.s, z0.s\n"
- "add z13.s, z13.s, z1.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z12.s, z12.s, z27.s\n"
+ "add z13.s, z13.s, z26.s\n"
+ "add z14.s, z14.s, z25.s\n"
+ "add z15.s, z15.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z27.s\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z23.s, z23.s, z24.s\n"
"tbz %x[flags], #4, 49f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1250,10 +1250,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 50f\n"
"49:" // Height 4: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -1278,141 +1278,141 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
"tbz %x[flags], #5, 51f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z12.d, z0.d\n"
- "and z5.d, z13.d, z1.d\n"
- "and z6.d, z14.d, z2.d\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z4.d, z20.d, z0.d\n"
- "and z5.d, z21.d, z1.d\n"
- "and z6.d, z22.d, z2.d\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "sqadd z23.s, z23.s, z7.s\n"
+ "and z27.d, z8.d, z0.d\n"
+ "and z26.d, z9.d, z1.d\n"
+ "and z25.d, z10.d, z2.d\n"
+ "and z24.d, z11.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z27.s\n"
+ "sqadd z9.s, z9.s, z26.s\n"
+ "sqadd z10.s, z10.s, z25.s\n"
+ "sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z12.d, z0.d\n"
+ "and z26.d, z13.d, z1.d\n"
+ "and z25.d, z14.d, z2.d\n"
+ "and z24.d, z15.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z27.s\n"
+ "sqadd z13.s, z13.s, z26.s\n"
+ "sqadd z14.s, z14.s, z25.s\n"
+ "sqadd z15.s, z15.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
+ "and z25.d, z18.d, z2.d\n"
+ "and z24.d, z19.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z1.d\n"
+ "and z25.d, z22.d, z2.d\n"
+ "and z24.d, z23.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"51:" // Height 4: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z8.s, z8.s, z25.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z12.s, z12.s, z25.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z13.s, z13.s, z4.s\n"
- "add z14.s, z14.s, z4.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z25.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z15.s, z15.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z25.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ "add z22.s, z22.s, z25.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z23.s, z23.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z24.s\n"
+ "smin z9.s, p2/M, z9.s, z24.s\n"
+ "smin z10.s, p2/M, z10.s, z24.s\n"
+ "smin z11.s, p2/M, z11.s, z24.s\n"
+ "smin z12.s, p2/M, z12.s, z24.s\n"
+ "smin z13.s, p2/M, z13.s, z24.s\n"
+ "smin z14.s, p2/M, z14.s, z24.s\n"
+ "smin z15.s, p2/M, z15.s, z24.s\n"
+ "smin z16.s, p2/M, z16.s, z24.s\n"
+ "smin z17.s, p2/M, z17.s, z24.s\n"
+ "smin z18.s, p2/M, z18.s, z24.s\n"
+ "smin z19.s, p2/M, z19.s, z24.s\n"
+ "smin z20.s, p2/M, z20.s, z24.s\n"
+ "smin z21.s, p2/M, z21.s, z24.s\n"
+ "smin z22.s, p2/M, z22.s, z24.s\n"
+ "smin z23.s, p2/M, z23.s, z24.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "uzp1 z24.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z24.b\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"st1b { z8.b }, p1, [x11]\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
+ "uzp1 z24.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z24.b\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z12.b }, p1, [x24]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "st1b { z16.b }, p1, [x23]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x24]\n"
"addvl x11, x11, #1\n"
"52:" // Height 4: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -1454,15 +1454,15 @@ void sve_hybrid_s8qs_dot_6x4VL (
"56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1473,124 +1473,124 @@ void sve_hybrid_s8qs_dot_6x4VL (
"b 58f\n"
"57:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"58:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 60f\n"
"59:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z29.b, z4.b[0]\n"
+ "sdot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z24.s, z29.b, z0.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z13.s, z28.b, z3.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "sdot z21.s, z28.b, z1.b[0]\n"
+ "sdot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[0]\n"
+ "sdot z14.s, z29.b, z3.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z0.b[0]\n"
+ "sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[1]\n"
+ "sdot z12.s, z29.b, z3.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z1.b[1]\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z1.b[1]\n"
+ "sdot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z10.s, z29.b, z4.b[1]\n"
+ "sdot z14.s, z29.b, z3.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z0.b[1]\n"
+ "sdot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[2]\n"
+ "sdot z12.s, z29.b, z3.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z1.b[2]\n"
+ "sdot z24.s, z29.b, z0.b[2]\n"
+ "sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "sdot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[2]\n"
+ "sdot z14.s, z29.b, z3.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z0.b[2]\n"
+ "sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[3]\n"
+ "sdot z12.s, z29.b, z3.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z1.b[3]\n"
+ "sdot z24.s, z29.b, z0.b[3]\n"
+ "sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[3]\n"
+ "sdot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[3]\n"
+ "sdot z14.s, z29.b, z3.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z0.b[3]\n"
+ "sdot z11.s, z28.b, z4.b[3]\n"
+ "sdot z15.s, z28.b, z3.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z0.b[3]\n"
"bgt 59b\n"
"60:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1600,146 +1600,146 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z29.b, z0.b[0]\n"
+ "sdot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z3.b[0]\n"
+ "sdot z24.s, z29.b, z4.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
+ "sdot z21.s, z28.b, z3.b[0]\n"
+ "sdot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z10.s, z29.b, z0.b[0]\n"
+ "sdot z14.s, z29.b, z1.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z3.b[0]\n"
+ "sdot z26.s, z29.b, z4.b[0]\n"
+ "sdot z11.s, z28.b, z0.b[0]\n"
+ "sdot z15.s, z28.b, z1.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z3.b[0]\n"
+ "sdot z27.s, z28.b, z4.b[0]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[1]\n"
+ "sdot z12.s, z29.b, z1.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z4.b[1]\n"
+ "sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z3.b[1]\n"
+ "sdot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z10.s, z29.b, z0.b[1]\n"
+ "sdot z14.s, z29.b, z1.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z3.b[1]\n"
+ "sdot z26.s, z29.b, z4.b[1]\n"
+ "sdot z11.s, z28.b, z0.b[1]\n"
+ "sdot z15.s, z28.b, z1.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z3.b[1]\n"
+ "sdot z27.s, z28.b, z4.b[1]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[2]\n"
+ "sdot z12.s, z29.b, z1.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z4.b[2]\n"
+ "sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z3.b[2]\n"
+ "sdot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z10.s, z29.b, z0.b[2]\n"
+ "sdot z14.s, z29.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z3.b[2]\n"
+ "sdot z26.s, z29.b, z4.b[2]\n"
+ "sdot z11.s, z28.b, z0.b[2]\n"
+ "sdot z15.s, z28.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z3.b[2]\n"
+ "sdot z27.s, z28.b, z4.b[2]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[3]\n"
+ "sdot z12.s, z29.b, z1.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z3.b[3]\n"
+ "sdot z24.s, z29.b, z4.b[3]\n"
+ "sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z3.b[3]\n"
+ "sdot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z10.s, z29.b, z0.b[3]\n"
+ "sdot z14.s, z29.b, z1.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z3.b[3]\n"
+ "sdot z26.s, z29.b, z4.b[3]\n"
+ "sdot z11.s, z28.b, z0.b[3]\n"
+ "sdot z15.s, z28.b, z1.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z3.b[3]\n"
+ "sdot z27.s, z28.b, z4.b[3]\n"
"61:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 56b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "ld1w { z31.s }, p2/Z, [x14]\n"
+ "add x25, x26, x20\n"
+ "ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
+ "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z8.s, z8.s, z31.s\n"
+ "add z9.s, z9.s, z30.s\n"
"addvl x14, x14, #4\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z12.s, z12.s, z0.s\n"
- "add z13.s, z13.s, z1.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
+ "add z10.s, z10.s, z29.s\n"
+ "add z11.s, z11.s, z28.s\n"
+ "add z12.s, z12.s, z31.s\n"
+ "add z13.s, z13.s, z30.s\n"
+ "add z14.s, z14.s, z29.s\n"
+ "add z15.s, z15.s, z28.s\n"
+ "add z16.s, z16.s, z31.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z31.s\n"
+ "add z21.s, z21.s, z30.s\n"
+ "add z22.s, z22.s, z29.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
"tbz %x[flags], #4, 62f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1753,10 +1753,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 63f\n"
"62:" // Height 5: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -1785,173 +1785,173 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
"tbz %x[flags], #5, 64f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z12.d, z0.d\n"
- "and z5.d, z13.d, z1.d\n"
- "and z6.d, z14.d, z2.d\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z4.d, z20.d, z0.d\n"
- "and z5.d, z21.d, z1.d\n"
- "and z6.d, z22.d, z2.d\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "sqadd z23.s, z23.s, z7.s\n"
- "and z4.d, z24.d, z0.d\n"
- "and z5.d, z25.d, z1.d\n"
- "and z6.d, z26.d, z2.d\n"
- "and z7.d, z27.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z7.s\n"
+ "and z31.d, z8.d, z0.d\n"
+ "and z30.d, z9.d, z1.d\n"
+ "and z29.d, z10.d, z2.d\n"
+ "and z28.d, z11.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z31.s\n"
+ "sqadd z9.s, z9.s, z30.s\n"
+ "sqadd z10.s, z10.s, z29.s\n"
+ "sqadd z11.s, z11.s, z28.s\n"
+ "and z31.d, z12.d, z0.d\n"
+ "and z30.d, z13.d, z1.d\n"
+ "and z29.d, z14.d, z2.d\n"
+ "and z28.d, z15.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z31.s\n"
+ "sqadd z13.s, z13.s, z30.s\n"
+ "sqadd z14.s, z14.s, z29.s\n"
+ "sqadd z15.s, z15.s, z28.s\n"
+ "and z31.d, z16.d, z0.d\n"
+ "and z30.d, z17.d, z1.d\n"
+ "and z29.d, z18.d, z2.d\n"
+ "and z28.d, z19.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z31.s\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "and z31.d, z20.d, z0.d\n"
+ "and z30.d, z21.d, z1.d\n"
+ "and z29.d, z22.d, z2.d\n"
+ "and z28.d, z23.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z31.s\n"
+ "sqadd z21.s, z21.s, z30.s\n"
+ "sqadd z22.s, z22.s, z29.s\n"
+ "sqadd z23.s, z23.s, z28.s\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z1.d\n"
+ "and z29.d, z26.d, z2.d\n"
+ "and z28.d, z27.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"64:" // Height 5: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z8.s, z8.s, z29.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z29.s\n"
+ "add z10.s, z10.s, z29.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
+ "add z11.s, z11.s, z29.s\n"
+ "add z12.s, z12.s, z29.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
- "add z13.s, z13.s, z4.s\n"
- "add z14.s, z14.s, z4.s\n"
+ "add z13.s, z13.s, z29.s\n"
+ "add z14.s, z14.s, z29.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z15.s, z15.s, z29.s\n"
+ "add z16.s, z16.s, z29.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z29.s\n"
+ "add z18.s, z18.s, z29.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z29.s\n"
+ "add z20.s, z20.s, z29.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z29.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z23.s, z23.s, z29.s\n"
+ "add z24.s, z24.s, z29.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z29.s\n"
+ "add z26.s, z26.s, z29.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z29.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z28.s\n"
+ "smin z9.s, p2/M, z9.s, z28.s\n"
+ "smin z10.s, p2/M, z10.s, z28.s\n"
+ "smin z11.s, p2/M, z11.s, z28.s\n"
+ "smin z12.s, p2/M, z12.s, z28.s\n"
+ "smin z13.s, p2/M, z13.s, z28.s\n"
+ "smin z14.s, p2/M, z14.s, z28.s\n"
+ "smin z15.s, p2/M, z15.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z23.s, p2/M, z23.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z8.s, p2/M, z8.s, z29.s\n"
+ "smax z9.s, p2/M, z9.s, z29.s\n"
+ "smax z10.s, p2/M, z10.s, z29.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z29.s\n"
+ "smax z12.s, p2/M, z12.s, z29.s\n"
+ "uzp1 z28.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z28.b\n"
+ "smax z13.s, p2/M, z13.s, z29.s\n"
+ "smax z14.s, p2/M, z14.s, z29.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"st1b { z8.b }, p1, [x11]\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z29.s\n"
+ "uzp1 z28.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z28.b\n"
+ "smax z17.s, p2/M, z17.s, z29.s\n"
+ "smax z18.s, p2/M, z18.s, z29.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z12.b }, p1, [x24]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z29.s\n"
+ "smax z20.s, p2/M, z20.s, z29.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z29.s\n"
+ "smax z22.s, p2/M, z22.s, z29.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "st1b { z16.b }, p1, [x23]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z29.s\n"
+ "smax z24.s, p2/M, z24.s, z29.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z29.s\n"
+ "smax z26.s, p2/M, z26.s, z29.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z20.b }, p1, [x22]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z29.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
"addvl x11, x11, #1\n"
"65:" // Height 5: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -2000,16 +2000,16 @@ void sve_hybrid_s8qs_dot_6x4VL (
"69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 70f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 71f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -2021,143 +2021,143 @@ void sve_hybrid_s8qs_dot_6x4VL (
"b 71f\n"
"70:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"71:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 73f\n"
"72:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[0]\n"
+ "sdot z12.s, z1.b, z6.b[0]\n"
+ "sdot z16.s, z1.b, z5.b[0]\n"
+ "sdot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z24.s, z1.b, z3.b[0]\n"
+ "sdot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[0]\n"
+ "sdot z13.s, z0.b, z6.b[0]\n"
+ "sdot z17.s, z0.b, z5.b[0]\n"
+ "sdot z21.s, z0.b, z4.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[0]\n"
+ "sdot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[0]\n"
+ "sdot z14.s, z1.b, z6.b[0]\n"
+ "sdot z18.s, z1.b, z5.b[0]\n"
+ "sdot z22.s, z1.b, z4.b[0]\n"
+ "sdot z26.s, z1.b, z3.b[0]\n"
+ "sdot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[0]\n"
+ "sdot z15.s, z0.b, z6.b[0]\n"
+ "sdot z19.s, z0.b, z5.b[0]\n"
+ "sdot z23.s, z0.b, z4.b[0]\n"
+ "sdot z27.s, z0.b, z3.b[0]\n"
+ "sdot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[1]\n"
+ "sdot z12.s, z1.b, z6.b[1]\n"
+ "sdot z16.s, z1.b, z5.b[1]\n"
+ "sdot z20.s, z1.b, z4.b[1]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[1]\n"
+ "sdot z13.s, z0.b, z6.b[1]\n"
+ "sdot z17.s, z0.b, z5.b[1]\n"
+ "sdot z21.s, z0.b, z4.b[1]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z10.s, z1.b, z7.b[1]\n"
+ "sdot z14.s, z1.b, z6.b[1]\n"
+ "sdot z18.s, z1.b, z5.b[1]\n"
+ "sdot z22.s, z1.b, z4.b[1]\n"
+ "sdot z26.s, z1.b, z3.b[1]\n"
+ "sdot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[1]\n"
+ "sdot z15.s, z0.b, z6.b[1]\n"
+ "sdot z19.s, z0.b, z5.b[1]\n"
+ "sdot z23.s, z0.b, z4.b[1]\n"
+ "sdot z27.s, z0.b, z3.b[1]\n"
+ "sdot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[2]\n"
+ "sdot z12.s, z1.b, z6.b[2]\n"
+ "sdot z16.s, z1.b, z5.b[2]\n"
+ "sdot z20.s, z1.b, z4.b[2]\n"
+ "sdot z24.s, z1.b, z3.b[2]\n"
+ "sdot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[2]\n"
+ "sdot z13.s, z0.b, z6.b[2]\n"
+ "sdot z17.s, z0.b, z5.b[2]\n"
+ "sdot z21.s, z0.b, z4.b[2]\n"
+ "sdot z25.s, z0.b, z3.b[2]\n"
+ "sdot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[2]\n"
+ "sdot z14.s, z1.b, z6.b[2]\n"
+ "sdot z18.s, z1.b, z5.b[2]\n"
+ "sdot z22.s, z1.b, z4.b[2]\n"
+ "sdot z26.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[2]\n"
+ "sdot z15.s, z0.b, z6.b[2]\n"
+ "sdot z19.s, z0.b, z5.b[2]\n"
+ "sdot z23.s, z0.b, z4.b[2]\n"
+ "sdot z27.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[3]\n"
+ "sdot z12.s, z1.b, z6.b[3]\n"
+ "sdot z16.s, z1.b, z5.b[3]\n"
+ "sdot z20.s, z1.b, z4.b[3]\n"
+ "sdot z24.s, z1.b, z3.b[3]\n"
+ "sdot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[3]\n"
+ "sdot z13.s, z0.b, z6.b[3]\n"
+ "sdot z17.s, z0.b, z5.b[3]\n"
+ "sdot z21.s, z0.b, z4.b[3]\n"
+ "sdot z25.s, z0.b, z3.b[3]\n"
+ "sdot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[3]\n"
+ "sdot z14.s, z1.b, z6.b[3]\n"
+ "sdot z18.s, z1.b, z5.b[3]\n"
+ "sdot z22.s, z1.b, z4.b[3]\n"
+ "sdot z26.s, z1.b, z3.b[3]\n"
+ "sdot z30.s, z1.b, z2.b[3]\n"
+ "sdot z11.s, z0.b, z7.b[3]\n"
+ "sdot z15.s, z0.b, z6.b[3]\n"
+ "sdot z19.s, z0.b, z5.b[3]\n"
+ "sdot z23.s, z0.b, z4.b[3]\n"
+ "sdot z27.s, z0.b, z3.b[3]\n"
+ "sdot z31.s, z0.b, z2.b[3]\n"
"bgt 72b\n"
"73:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -2168,167 +2168,167 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[0]\n"
+ "sdot z12.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z7.b, z3.b[0]\n"
+ "sdot z24.s, z7.b, z4.b[0]\n"
+ "sdot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[0]\n"
+ "sdot z21.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z4.b[0]\n"
+ "sdot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
+ "sdot z10.s, z7.b, z0.b[0]\n"
+ "sdot z14.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z7.b, z2.b[0]\n"
+ "sdot z22.s, z7.b, z3.b[0]\n"
+ "sdot z26.s, z7.b, z4.b[0]\n"
+ "sdot z30.s, z7.b, z5.b[0]\n"
+ "sdot z11.s, z6.b, z0.b[0]\n"
+ "sdot z15.s, z6.b, z1.b[0]\n"
+ "sdot z19.s, z6.b, z2.b[0]\n"
+ "sdot z23.s, z6.b, z3.b[0]\n"
+ "sdot z27.s, z6.b, z4.b[0]\n"
+ "sdot z31.s, z6.b, z5.b[0]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[1]\n"
+ "sdot z12.s, z7.b, z1.b[1]\n"
+ "sdot z16.s, z7.b, z2.b[1]\n"
+ "sdot z20.s, z7.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z4.b[1]\n"
+ "sdot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[1]\n"
+ "sdot z13.s, z6.b, z1.b[1]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z21.s, z6.b, z3.b[1]\n"
+ "sdot z25.s, z6.b, z4.b[1]\n"
+ "sdot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
+ "sdot z10.s, z7.b, z0.b[1]\n"
+ "sdot z14.s, z7.b, z1.b[1]\n"
+ "sdot z18.s, z7.b, z2.b[1]\n"
+ "sdot z22.s, z7.b, z3.b[1]\n"
+ "sdot z26.s, z7.b, z4.b[1]\n"
+ "sdot z30.s, z7.b, z5.b[1]\n"
+ "sdot z11.s, z6.b, z0.b[1]\n"
+ "sdot z15.s, z6.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z2.b[1]\n"
+ "sdot z23.s, z6.b, z3.b[1]\n"
+ "sdot z27.s, z6.b, z4.b[1]\n"
+ "sdot z31.s, z6.b, z5.b[1]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[2]\n"
+ "sdot z12.s, z7.b, z1.b[2]\n"
+ "sdot z16.s, z7.b, z2.b[2]\n"
+ "sdot z20.s, z7.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z4.b[2]\n"
+ "sdot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[2]\n"
+ "sdot z13.s, z6.b, z1.b[2]\n"
+ "sdot z17.s, z6.b, z2.b[2]\n"
+ "sdot z21.s, z6.b, z3.b[2]\n"
+ "sdot z25.s, z6.b, z4.b[2]\n"
+ "sdot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
+ "sdot z10.s, z7.b, z0.b[2]\n"
+ "sdot z14.s, z7.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z2.b[2]\n"
+ "sdot z22.s, z7.b, z3.b[2]\n"
+ "sdot z26.s, z7.b, z4.b[2]\n"
+ "sdot z30.s, z7.b, z5.b[2]\n"
+ "sdot z11.s, z6.b, z0.b[2]\n"
+ "sdot z15.s, z6.b, z1.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[2]\n"
+ "sdot z23.s, z6.b, z3.b[2]\n"
+ "sdot z27.s, z6.b, z4.b[2]\n"
+ "sdot z31.s, z6.b, z5.b[2]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[3]\n"
+ "sdot z12.s, z7.b, z1.b[3]\n"
+ "sdot z16.s, z7.b, z2.b[3]\n"
+ "sdot z20.s, z7.b, z3.b[3]\n"
+ "sdot z24.s, z7.b, z4.b[3]\n"
+ "sdot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[3]\n"
+ "sdot z13.s, z6.b, z1.b[3]\n"
+ "sdot z17.s, z6.b, z2.b[3]\n"
+ "sdot z21.s, z6.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z4.b[3]\n"
+ "sdot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z10.s, z7.b, z0.b[3]\n"
+ "sdot z14.s, z7.b, z1.b[3]\n"
+ "sdot z18.s, z7.b, z2.b[3]\n"
+ "sdot z22.s, z7.b, z3.b[3]\n"
+ "sdot z26.s, z7.b, z4.b[3]\n"
+ "sdot z30.s, z7.b, z5.b[3]\n"
+ "sdot z11.s, z6.b, z0.b[3]\n"
+ "sdot z15.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z23.s, z6.b, z3.b[3]\n"
+ "sdot z27.s, z6.b, z4.b[3]\n"
+ "sdot z31.s, z6.b, z5.b[3]\n"
"74:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 69b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x11, x20\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
"add x23, x24, x20\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
"add x22, x23, x20\n"
- "add x21, x22, x20\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add x20, x21, x20\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
"addvl x14, x14, #4\n"
- "add z12.s, z12.s, z0.s\n"
- "add z13.s, z13.s, z1.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z29.s, z29.s, z1.s\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
+ "add z12.s, z12.s, z3.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z1.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z20.s, z20.s, z3.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z1.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
+ "add z28.s, z28.s, z3.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z1.s\n"
+ "add z31.s, z31.s, z0.s\n"
"tbz %x[flags], #4, 75f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -2342,10 +2342,10 @@ void sve_hybrid_s8qs_dot_6x4VL (
"addvl x13, x13, #4\n"
"b 76f\n"
"75:" // Height 6: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -2378,81 +2378,81 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n"
".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
"tbz %x[flags], #5, 77f\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z7.d, z8.d, z0.d\n"
+ "and z6.d, z9.d, z1.d\n"
+ "and z5.d, z10.d, z2.d\n"
+ "and z4.d, z11.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z12.d, z0.d\n"
- "and z5.d, z13.d, z1.d\n"
- "and z6.d, z14.d, z2.d\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z4.d, z20.d, z0.d\n"
- "and z5.d, z21.d, z1.d\n"
- "and z6.d, z22.d, z2.d\n"
- "and z7.d, z23.d, z3.d\n"
"asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z7.s\n"
+ "sqadd z9.s, z9.s, z6.s\n"
+ "sqadd z10.s, z10.s, z5.s\n"
+ "sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z12.d, z0.d\n"
+ "and z6.d, z13.d, z1.d\n"
+ "and z5.d, z14.d, z2.d\n"
+ "and z4.d, z15.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "sqadd z23.s, z23.s, z7.s\n"
- "and z4.d, z24.d, z0.d\n"
- "and z5.d, z25.d, z1.d\n"
- "and z6.d, z26.d, z2.d\n"
- "and z7.d, z27.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z7.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "sqadd z14.s, z14.s, z5.s\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
+ "and z5.d, z18.d, z2.d\n"
+ "and z4.d, z19.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z7.s\n"
- "and z4.d, z28.d, z0.d\n"
- "and z5.d, z29.d, z1.d\n"
- "and z6.d, z30.d, z2.d\n"
- "and z7.d, z31.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z7.s\n"
+ "sqadd z17.s, z17.s, z6.s\n"
+ "sqadd z18.s, z18.s, z5.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z1.d\n"
+ "and z5.d, z22.d, z2.d\n"
+ "and z4.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z7.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "sqadd z22.s, z22.s, z5.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
+ "and z5.d, z26.d, z2.d\n"
+ "and z4.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z7.s\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z5.s\n"
+ "sqadd z27.s, z27.s, z4.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z1.d\n"
+ "and z5.d, z30.d, z2.d\n"
+ "and z4.d, z31.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z28.s, z28.s, z4.s\n"
- "sqadd z29.s, z29.s, z5.s\n"
- "sqadd z30.s, z30.s, z6.s\n"
- "sqadd z31.s, z31.s, z7.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z31.s, z31.s, z4.s\n"
"77:" // Height 6: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
@@ -2500,83 +2500,83 @@ void sve_hybrid_s8qs_dot_6x4VL (
"add z29.s, z29.s, z4.s\n"
"add z30.s, z30.s, z4.s\n"
".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"add z31.s, z31.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z0.s\n"
+ "smin z10.s, p2/M, z10.s, z0.s\n"
+ "smin z11.s, p2/M, z11.s, z0.s\n"
+ "smin z12.s, p2/M, z12.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z0.s\n"
+ "smin z14.s, p2/M, z14.s, z0.s\n"
+ "smin z15.s, p2/M, z15.s, z0.s\n"
+ "smin z16.s, p2/M, z16.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ "smin z19.s, p2/M, z19.s, z0.s\n"
+ "smin z20.s, p2/M, z20.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z0.s\n"
+ "smin z22.s, p2/M, z22.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z27.s, p2/M, z27.s, z0.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "smin z30.s, p2/M, z30.s, z0.s\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "uzp1 z0.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z0.b\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
"st1b { z8.b }, p1, [x11]\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
+ "uzp1 z0.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z0.b\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z12.b }, p1, [x24]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "st1b { z16.b }, p1, [x23]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z20.b }, p1, [x22]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
+ "st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
- "st1b { z24.b }, p1, [x21]\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x20]\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x22]\n"
"addvl x11, x11, #1\n"
"78:" // Height 6: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -2594,7 +2594,6 @@ void sve_hybrid_s8qs_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2602,4 +2601,4 @@ void sve_hybrid_s8qs_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
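For readers tracing the renumbered registers above: the arithmetic is unchanged, only the working registers move. A minimal scalar sketch of what one lane of the sve_hybrid_s8qs_dot_6x4VL kernel computes follows; the helper names and signatures are illustrative only, not library API, and the SVE code applies the same steps across whole vectors with per-column multipliers and shifts held in z0-z7.

#include <cstdint>

// sdot zAcc.s, zW.b, zA.b[i]: every 32-bit lane of zAcc accumulates the dot
// product of four signed weight bytes with four broadcast input bytes.
static inline int32_t sdot_lane(int32_t acc, const int8_t w[4], const int8_t a[4])
{
    for (int i = 0; i < 4; ++i)
        acc += static_cast<int32_t>(w[i]) * static_cast<int32_t>(a[i]);
    return acc;
}

// sqrdmulh: saturating rounding doubling multiply returning the high half,
// saturate((x * mul + 2^30) >> 31); saturation only fires when both inputs
// are INT32_MIN.
static inline int32_t sqrdmulh(int32_t x, int32_t mul)
{
    int64_t r = (static_cast<int64_t>(x) * mul + (INT64_C(1) << 30)) >> 31;
    return r > INT32_MAX ? INT32_MAX : static_cast<int32_t>(r);
}

// The requantise tail around the "no shift correction" label: sqrdmulh, the
// and/asr/sqadd tie correction (subtract 1 from negative values so exact
// halves round away from zero), the rounding right shift (srshl with a
// negative count), the c_offset add, and the smin/smax clamp that precedes
// the uzp1 narrowing stores.
static inline int8_t requantize_lane(int32_t acc, int32_t mul, int right_shift,
                                     int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh(acc, mul);
    if (right_shift > 0)
    {
        if (v < 0 && v > INT32_MIN)
            --v;                                            // sqadd correction
        v = (v + (1 << (right_shift - 1))) >> right_shift;  // srshl
    }
    v += c_offset;
    if (v > maxval) v = maxval;                             // smin
    if (v < minval) v = minval;                             // smax
    return static_cast<int8_t>(v);
}

The tie correction only changes results that land exactly on a rounding boundary, which is why the kernel can branch past the whole and/asr/sqadd block when bit 5 of the flags word is clear (the "tbz %x[flags], #5" test above).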
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
index 2b7ad8bf4b..b1b1135c73 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
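The sve_hybrid_s8qs_mmla_6x4VL changes below apply the same register renumbering to the smmla variant, whose data flow is easier to see in scalar form. A sketch under illustrative names, not library API:

#include <cstdint>

// trn1/trn2 on .d elements interleave two input rows so that every 128-bit
// segment holds eight bytes of row r followed by eight bytes of row r+1;
// smmla then multiplies that 2x8 int8 block by the transpose of a 2x8 int8
// weight block and adds the 2x2 int32 product to the accumulator. One call
// per 128-bit segment:
static void smmla_segment(int32_t acc[2][2],
                          const int8_t a[2][8],  // two interleaved input rows
                          const int8_t b[2][8])  // two weight columns
{
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                acc[i][j] += static_cast<int32_t>(a[i][k]) *
                             static_cast<int32_t>(b[j][k]);
}

Per 128-bit segment an smmla retires 32 multiply-accumulates against sdot's 16. For odd heights the second row of a pair is a don't-care; its half of each 2x2 result is dropped when uzp1 de-interleaves the accumulators (the "uzp1 z8.d, z8.d, z12.d" sequence in the Height 1 writeback below).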
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
index 6041794bdb..cd5f85411c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
@@ -117,11 +117,11 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -133,86 +133,86 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"ble 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"bgt 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
"addvl x9, x9, #8\n"
"ble 9f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
"addvl x9, x9, #8\n"
"9:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -221,18 +221,18 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"bne 4b\n"
"uzp1 z8.d, z8.d, z12.d\n"
"uzp1 z9.d, z9.d, z13.d\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
"uzp1 z10.d, z10.d, z14.d\n"
"uzp1 z11.d, z11.d, z15.d\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
"mov z15.d, z8.d\n"
- "add z15.s, z15.s, z0.s\n"
+ "add z15.s, z15.s, z19.s\n"
"addvl x14, x14, #4\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
"tbz %x[flags], #4, 10f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -246,10 +246,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 11f\n"
"10:" // Height 1: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -262,44 +262,44 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 12f\n"
- "and z4.d, z15.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z15.s, z15.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
+ "and z19.d, z15.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
"12:" // Height 1: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z15.s, z15.s, z4.s\n"
+ "add z15.s, z15.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z11.s, z11.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smax z15.s, p2/M, z15.s, z31.s\n"
+ "smax z9.s, p2/M, z9.s, z31.s\n"
+ "smax z10.s, p2/M, z10.s, z31.s\n"
"uzp1 z15.h, z15.h, z9.h\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z15.b, z15.b, z9.b\n"
+ "smax z11.s, p2/M, z11.s, z31.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z15.b, z15.b, z16.b\n"
"st1b { z15.b }, p1, [x11]\n"
"addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
@@ -330,12 +330,12 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 19f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -343,125 +343,125 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"b 19f\n"
"18:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"19:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 21f\n"
"20:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"bgt 20b\n"
"21:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
"addvl x9, x9, #8\n"
"ble 22f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
"addvl x9, x9, #8\n"
"22:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 17b\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z20.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add x24, x11, x20\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
"addvl x14, x14, #4\n"
- "mov z15.d, z7.d\n"
- "add z15.s, z15.s, z0.s\n"
- "add z12.s, z12.s, z1.s\n"
- "add z13.s, z13.s, z2.s\n"
- "add z14.s, z14.s, z3.s\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "mov z15.d, z20.d\n"
+ "add z15.s, z15.s, z19.s\n"
+ "add z12.s, z12.s, z18.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z8.s, z8.s, z19.s\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
"tbz %x[flags], #4, 23f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -475,10 +475,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 24f\n"
"23:" // Height 2: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -495,77 +495,77 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 25f\n"
- "and z4.d, z15.d, z0.d\n"
- "and z5.d, z12.d, z1.d\n"
- "and z6.d, z13.d, z2.d\n"
- "and z7.d, z14.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z15.s, z15.s, z4.s\n"
- "sqadd z12.s, z12.s, z5.s\n"
- "sqadd z13.s, z13.s, z6.s\n"
- "sqadd z14.s, z14.s, z7.s\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
+ "and z19.d, z15.d, z0.d\n"
+ "and z18.d, z12.d, z1.d\n"
+ "and z17.d, z13.d, z2.d\n"
+ "and z16.d, z14.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z19.s\n"
+ "sqadd z12.s, z12.s, z18.s\n"
+ "sqadd z13.s, z13.s, z17.s\n"
+ "sqadd z14.s, z14.s, z16.s\n"
+ "and z18.d, z8.d, z0.d\n"
+ "and z24.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z18.s\n"
+ "sqadd z9.s, z9.s, z24.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
"25:" // Height 2: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z15.s, z15.s, z4.s\n"
+ "add z15.s, z15.s, z17.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
- "add z12.s, z12.s, z4.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "add z12.s, z12.s, z17.s\n"
+ "add z13.s, z13.s, z17.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z8.s, z8.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z11.s, z11.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
"uzp1 z15.h, z15.h, z12.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "uzp1 z12.h, z13.h, z14.h\n"
- "uzp1 z15.b, z15.b, z12.b\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "uzp1 z16.h, z13.h, z14.h\n"
+ "uzp1 z15.b, z15.b, z16.b\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
"st1b { z15.b }, p1, [x11]\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x24]\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "st1b { z8.b }, p1, [x26]\n"
"addvl x11, x11, #1\n"
"26:" // Height 2: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -603,13 +603,13 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 32f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -618,174 +618,174 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"b 32f\n"
"31:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"32:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 35f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
"35:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 30b\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z28.d, z8.d, z12.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add x24, x11, x20\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x20\n"
"addvl x14, x14, #4\n"
"uzp1 z16.d, z16.d, z20.d\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
"uzp1 z19.d, z19.d, z23.d\n"
- "mov z23.d, z7.d\n"
- "add z23.s, z23.s, z0.s\n"
- "add z12.s, z12.s, z1.s\n"
- "add z13.s, z13.s, z2.s\n"
- "add z14.s, z14.s, z3.s\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
+ "add z12.s, z12.s, z26.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z24.s\n"
+ "add z8.s, z8.s, z27.s\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
"tbz %x[flags], #4, 36f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -799,10 +799,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 37f\n"
"36:" // Height 3: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -823,109 +823,109 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
"tbz %x[flags], #5, 38f\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z12.d, z1.d\n"
- "and z6.d, z13.d, z2.d\n"
- "and z7.d, z14.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z12.s, z12.s, z5.s\n"
- "sqadd z13.s, z13.s, z6.s\n"
- "sqadd z14.s, z14.s, z7.s\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z24.d, z23.d, z0.d\n"
+ "and z22.d, z12.d, z1.d\n"
+ "and z21.d, z13.d, z2.d\n"
+ "and z20.d, z14.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "sqadd z12.s, z12.s, z22.s\n"
+ "sqadd z13.s, z13.s, z21.s\n"
+ "sqadd z14.s, z14.s, z20.s\n"
+ "and z24.d, z8.d, z0.d\n"
+ "and z22.d, z9.d, z1.d\n"
+ "and z21.d, z10.d, z2.d\n"
+ "and z20.d, z11.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z24.s\n"
+ "sqadd z9.s, z9.s, z22.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "sqadd z11.s, z11.s, z20.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
+ "and z21.d, z18.d, z2.d\n"
+ "and z20.d, z19.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"38:" // Height 3: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z21.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
- "add z12.s, z12.s, z4.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "add z12.s, z12.s, z21.s\n"
+ "add z13.s, z13.s, z21.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z14.s, z14.s, z21.s\n"
+ "add z8.s, z8.s, z21.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
"uzp1 z23.h, z23.h, z12.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "uzp1 z12.h, z13.h, z14.h\n"
- "uzp1 z23.b, z23.b, z12.b\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "uzp1 z20.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
"st1b { z23.b }, p1, [x11]\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
+ "uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z8.b }, p1, [x24]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x23]\n"
+ "st1b { z16.b }, p1, [x25]\n"
"addvl x11, x11, #1\n"
"39:" // Height 3: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -963,14 +963,14 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 44f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 45f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -980,161 +980,161 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"b 45f\n"
"44:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"45:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 47f\n"
"46:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 46b\n"
"47:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 48f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
"48:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 43b\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z28.d, z8.d, z12.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add x24, x11, x20\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
"addvl x14, x14, #4\n"
@@ -1144,23 +1144,23 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "mov z23.d, z7.d\n"
- "add z23.s, z23.s, z0.s\n"
- "add z12.s, z12.s, z1.s\n"
- "add z13.s, z13.s, z2.s\n"
- "add z14.s, z14.s, z3.s\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z15.s, z15.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
+ "add z12.s, z12.s, z26.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z24.s\n"
+ "add z8.s, z8.s, z27.s\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z15.s, z15.s, z27.s\n"
+ "add z20.s, z20.s, z26.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
"tbz %x[flags], #4, 49f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1174,10 +1174,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 50f\n"
"49:" // Height 4: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -1202,141 +1202,141 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
"tbz %x[flags], #5, 51f\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z12.d, z1.d\n"
- "and z6.d, z13.d, z2.d\n"
- "and z7.d, z14.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z12.s, z12.s, z5.s\n"
- "sqadd z13.s, z13.s, z6.s\n"
- "sqadd z14.s, z14.s, z7.s\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z15.d, z0.d\n"
- "and z5.d, z20.d, z1.d\n"
- "and z6.d, z21.d, z2.d\n"
- "and z7.d, z22.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z15.s, z15.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z27.d, z23.d, z0.d\n"
+ "and z26.d, z12.d, z1.d\n"
+ "and z25.d, z13.d, z2.d\n"
+ "and z24.d, z14.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z27.s\n"
+ "sqadd z12.s, z12.s, z26.s\n"
+ "sqadd z13.s, z13.s, z25.s\n"
+ "sqadd z14.s, z14.s, z24.s\n"
+ "and z27.d, z8.d, z0.d\n"
+ "and z26.d, z9.d, z1.d\n"
+ "and z25.d, z10.d, z2.d\n"
+ "and z24.d, z11.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z27.s\n"
+ "sqadd z9.s, z9.s, z26.s\n"
+ "sqadd z10.s, z10.s, z25.s\n"
+ "sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z15.d, z0.d\n"
+ "and z26.d, z20.d, z1.d\n"
+ "and z25.d, z21.d, z2.d\n"
+ "and z24.d, z22.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z27.s\n"
+ "sqadd z20.s, z20.s, z26.s\n"
+ "sqadd z21.s, z21.s, z25.s\n"
+ "sqadd z22.s, z22.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
+ "and z25.d, z18.d, z2.d\n"
+ "and z24.d, z19.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
"51:" // Height 4: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z25.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
- "add z12.s, z12.s, z4.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "add z12.s, z12.s, z25.s\n"
+ "add z13.s, z13.s, z25.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z14.s, z14.s, z25.s\n"
+ "add z8.s, z8.s, z25.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z15.s, z15.s, z25.s\n"
".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z25.s\n"
+ "add z21.s, z21.s, z25.s\n"
".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z24.s\n"
+ "smin z12.s, p2/M, z12.s, z24.s\n"
+ "smin z13.s, p2/M, z13.s, z24.s\n"
+ "smin z14.s, p2/M, z14.s, z24.s\n"
+ "smin z8.s, p2/M, z8.s, z24.s\n"
+ "smin z9.s, p2/M, z9.s, z24.s\n"
+ "smin z10.s, p2/M, z10.s, z24.s\n"
+ "smin z11.s, p2/M, z11.s, z24.s\n"
+ "smin z15.s, p2/M, z15.s, z24.s\n"
+ "smin z20.s, p2/M, z20.s, z24.s\n"
+ "smin z21.s, p2/M, z21.s, z24.s\n"
+ "smin z22.s, p2/M, z22.s, z24.s\n"
+ "smin z16.s, p2/M, z16.s, z24.s\n"
+ "smin z17.s, p2/M, z17.s, z24.s\n"
+ "smin z18.s, p2/M, z18.s, z24.s\n"
+ "smin z19.s, p2/M, z19.s, z24.s\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
"uzp1 z23.h, z23.h, z12.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "uzp1 z12.h, z13.h, z14.h\n"
- "uzp1 z23.b, z23.b, z12.b\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "uzp1 z24.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z24.b\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
"st1b { z23.b }, p1, [x11]\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "uzp1 z23.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "st1b { z8.b }, p1, [x24]\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z15.b, z15.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z15.b }, p1, [x23]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x24]\n"
"addvl x11, x11, #1\n"
"52:" // Height 4: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -1382,15 +1382,15 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1401,204 +1401,204 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"b 58f\n"
"57:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"58:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 60f\n"
"59:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
+ ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
+ ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
+ ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"bgt 59b\n"
"60:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
"addvl x9, x9, #8\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 61f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
"61:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 56b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z4.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "add x24, x11, x20\n"
+ "add x26, x11, x20\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x20\n"
"addvl x14, x14, #4\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
@@ -1610,27 +1610,27 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
- "add z31.s, z31.s, z0.s\n"
- "add z12.s, z12.s, z1.s\n"
- "add z13.s, z13.s, z2.s\n"
- "add z14.s, z14.s, z3.s\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z15.s, z15.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
+ "mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
+ "add z12.s, z12.s, z2.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z14.s, z14.s, z0.s\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
"tbz %x[flags], #4, 62f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1644,10 +1644,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 63f\n"
"62:" // Height 5: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -1676,173 +1676,173 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
"tbz %x[flags], #5, 64f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z12.d, z1.d\n"
- "and z6.d, z13.d, z2.d\n"
- "and z7.d, z14.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z12.s, z12.s, z5.s\n"
- "sqadd z13.s, z13.s, z6.s\n"
- "sqadd z14.s, z14.s, z7.s\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z15.d, z0.d\n"
- "and z5.d, z20.d, z1.d\n"
- "and z6.d, z21.d, z2.d\n"
- "and z7.d, z22.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z15.s, z15.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z4.d, z24.d, z0.d\n"
- "and z5.d, z25.d, z1.d\n"
- "and z6.d, z26.d, z2.d\n"
- "and z7.d, z27.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z7.s\n"
+ "and z30.d, z31.d, z0.d\n"
+ "and z29.d, z12.d, z1.d\n"
+ "and z28.d, z13.d, z2.d\n"
+ "and z23.d, z14.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z30.s\n"
+ "sqadd z12.s, z12.s, z29.s\n"
+ "sqadd z13.s, z13.s, z28.s\n"
+ "sqadd z14.s, z14.s, z23.s\n"
+ "and z30.d, z8.d, z0.d\n"
+ "and z29.d, z9.d, z1.d\n"
+ "and z28.d, z10.d, z2.d\n"
+ "and z23.d, z11.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z30.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ "sqadd z10.s, z10.s, z28.s\n"
+ "sqadd z11.s, z11.s, z23.s\n"
+ "and z30.d, z15.d, z0.d\n"
+ "and z29.d, z20.d, z1.d\n"
+ "and z28.d, z21.d, z2.d\n"
+ "and z23.d, z22.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z30.s\n"
+ "sqadd z20.s, z20.s, z29.s\n"
+ "sqadd z21.s, z21.s, z28.s\n"
+ "sqadd z22.s, z22.s, z23.s\n"
+ "and z30.d, z16.d, z0.d\n"
+ "and z29.d, z17.d, z1.d\n"
+ "and z28.d, z18.d, z2.d\n"
+ "and z23.d, z19.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z30.s\n"
+ "sqadd z17.s, z17.s, z29.s\n"
+ "sqadd z18.s, z18.s, z28.s\n"
+ "sqadd z19.s, z19.s, z23.s\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z1.d\n"
+ "and z28.d, z26.d, z2.d\n"
+ "and z23.d, z27.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
"64:" // Height 5: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z31.s, z31.s, z28.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
- "add z12.s, z12.s, z4.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "add z12.s, z12.s, z28.s\n"
+ "add z13.s, z13.s, z28.s\n"
".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z8.s, z8.s, z4.s\n"
+ "add z14.s, z14.s, z28.s\n"
+ "add z8.s, z8.s, z28.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
+ "add z9.s, z9.s, z28.s\n"
+ "add z10.s, z10.s, z28.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
+ "add z11.s, z11.s, z28.s\n"
+ "add z15.s, z15.s, z28.s\n"
".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z28.s\n"
".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z28.s\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z23.s\n"
+ "smin z12.s, p2/M, z12.s, z23.s\n"
+ "smin z13.s, p2/M, z13.s, z23.s\n"
+ "smin z14.s, p2/M, z14.s, z23.s\n"
+ "smin z8.s, p2/M, z8.s, z23.s\n"
+ "smin z9.s, p2/M, z9.s, z23.s\n"
+ "smin z10.s, p2/M, z10.s, z23.s\n"
+ "smin z11.s, p2/M, z11.s, z23.s\n"
+ "smin z15.s, p2/M, z15.s, z23.s\n"
+ "smin z20.s, p2/M, z20.s, z23.s\n"
+ "smin z21.s, p2/M, z21.s, z23.s\n"
+ "smin z22.s, p2/M, z22.s, z23.s\n"
+ "smin z16.s, p2/M, z16.s, z23.s\n"
+ "smin z17.s, p2/M, z17.s, z23.s\n"
+ "smin z18.s, p2/M, z18.s, z23.s\n"
+ "smin z19.s, p2/M, z19.s, z23.s\n"
+ "smin z24.s, p2/M, z24.s, z23.s\n"
+ "smin z25.s, p2/M, z25.s, z23.s\n"
+ "smin z26.s, p2/M, z26.s, z23.s\n"
+ "smin z27.s, p2/M, z27.s, z23.s\n"
+ "smax z31.s, p2/M, z31.s, z28.s\n"
+ "smax z12.s, p2/M, z12.s, z28.s\n"
+ "smax z13.s, p2/M, z13.s, z28.s\n"
"uzp1 z31.h, z31.h, z12.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "uzp1 z12.h, z13.h, z14.h\n"
- "uzp1 z31.b, z31.b, z12.b\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z28.s\n"
+ "smax z8.s, p2/M, z8.s, z28.s\n"
+ "uzp1 z23.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z23.b\n"
+ "smax z9.s, p2/M, z9.s, z28.s\n"
+ "smax z10.s, p2/M, z10.s, z28.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
"st1b { z31.b }, p1, [x11]\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z28.s\n"
+ "smax z15.s, p2/M, z15.s, z28.s\n"
+ "uzp1 z23.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "st1b { z8.b }, p1, [x24]\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z15.b, z15.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z15.b }, p1, [x23]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
"addvl x11, x11, #1\n"
"65:" // Height 5: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -1891,16 +1891,16 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 70f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 71f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1912,209 +1912,209 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"b 71f\n"
"70:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"71:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 73f\n"
"72:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
"add x21, x21, #0x10\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
"bgt 72b\n"
"73:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
"addvl x9, x9, #8\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 74f\n"
- "ld1b { z7.b }, p2/Z, [x9]\n"
- "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
"74:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 69b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z8.d, z12.d\n"
- "add x24, x11, x20\n"
+ "uzp1 z4.d, z8.d, z12.d\n"
+ "add x26, x11, x20\n"
"uzp2 z8.d, z8.d, z12.d\n"
"uzp1 z12.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "add x23, x24, x20\n"
- "ld1w { z0.s }, p2/Z, [x14]\n"
+ "add x25, x26, x20\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
"uzp1 z13.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
"uzp1 z14.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
- "add x22, x23, x20\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x24, x25, x20\n"
"uzp1 z15.d, z16.d, z20.d\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x21, x22, x20\n"
- "add x20, x21, x20\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"addvl x14, x14, #4\n"
@@ -2130,31 +2130,31 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"uzp2 z26.d, z26.d, z30.d\n"
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
- "add z31.s, z31.s, z0.s\n"
- "add z12.s, z12.s, z1.s\n"
- "add z13.s, z13.s, z2.s\n"
- "add z14.s, z14.s, z3.s\n"
- "add z8.s, z8.s, z0.s\n"
- "add z9.s, z9.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z15.s, z15.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z23.s, z23.s, z0.s\n"
- "add z28.s, z28.s, z1.s\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
+ "mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
+ "add z12.s, z12.s, z2.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z14.s, z14.s, z0.s\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
"tbz %x[flags], #4, 75f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -2168,10 +2168,10 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"addvl x13, x13, #4\n"
"b 76f\n"
"75:" // Height 6: per layer parameters
- "add x26, %x[qp], %[per_layer_right_shift]\n"
- "add x25, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z0.s }, p2/Z, [x26]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z2.d, z0.d\n"
@@ -2204,81 +2204,81 @@ void sve_hybrid_s8qs_mmla_6x4VL (
".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
"tbz %x[flags], #5, 77f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z12.d, z1.d\n"
- "and z6.d, z13.d, z2.d\n"
- "and z7.d, z14.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z7.d, z31.d, z0.d\n"
+ "and z6.d, z12.d, z1.d\n"
+ "and z5.d, z13.d, z2.d\n"
+ "and z4.d, z14.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z12.s, z12.s, z5.s\n"
- "sqadd z13.s, z13.s, z6.s\n"
- "sqadd z14.s, z14.s, z7.s\n"
- "and z4.d, z8.d, z0.d\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z4.d, z15.d, z0.d\n"
- "and z5.d, z20.d, z1.d\n"
- "and z6.d, z21.d, z2.d\n"
- "and z7.d, z22.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z15.s, z15.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z1.d\n"
- "and z6.d, z18.d, z2.d\n"
- "and z7.d, z19.d, z3.d\n"
"asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z7.s\n"
+ "sqadd z12.s, z12.s, z6.s\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "sqadd z14.s, z14.s, z4.s\n"
+ "and z7.d, z8.d, z0.d\n"
+ "and z6.d, z9.d, z1.d\n"
+ "and z5.d, z10.d, z2.d\n"
+ "and z4.d, z11.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z28.d, z1.d\n"
- "and z6.d, z29.d, z2.d\n"
- "and z7.d, z30.d, z3.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z7.s\n"
+ "sqadd z9.s, z9.s, z6.s\n"
+ "sqadd z10.s, z10.s, z5.s\n"
+ "sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z15.d, z0.d\n"
+ "and z6.d, z20.d, z1.d\n"
+ "and z5.d, z21.d, z2.d\n"
+ "and z4.d, z22.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z28.s, z28.s, z5.s\n"
- "sqadd z29.s, z29.s, z6.s\n"
- "sqadd z30.s, z30.s, z7.s\n"
- "and z4.d, z24.d, z0.d\n"
- "and z5.d, z25.d, z1.d\n"
- "and z6.d, z26.d, z2.d\n"
- "and z7.d, z27.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "sqadd z20.s, z20.s, z6.s\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "sqadd z22.s, z22.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
+ "and z5.d, z18.d, z2.d\n"
+ "and z4.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z7.s\n"
+ "sqadd z17.s, z17.s, z6.s\n"
+ "sqadd z18.s, z18.s, z5.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z23.d, z0.d\n"
+ "and z6.d, z28.d, z1.d\n"
+ "and z5.d, z29.d, z2.d\n"
+ "and z4.d, z30.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "sqadd z29.s, z29.s, z5.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
+ "and z5.d, z26.d, z2.d\n"
+ "and z4.d, z27.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z7.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z7.s\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z5.s\n"
+ "sqadd z27.s, z27.s, z4.s\n"
"77:" // Height 6: no shift correction
- "add x25, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"add z31.s, z31.s, z4.s\n"
".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
@@ -2326,83 +2326,83 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"add z25.s, z25.s, z4.s\n"
"add z26.s, z26.s, z4.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
- "add x25, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x25]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"add z27.s, z27.s, z4.s\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x25]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "smin z12.s, p2/M, z12.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z0.s\n"
+ "smin z14.s, p2/M, z14.s, z0.s\n"
+ "smin z8.s, p2/M, z8.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z0.s\n"
+ "smin z10.s, p2/M, z10.s, z0.s\n"
+ "smin z11.s, p2/M, z11.s, z0.s\n"
+ "smin z15.s, p2/M, z15.s, z0.s\n"
+ "smin z20.s, p2/M, z20.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z0.s\n"
+ "smin z22.s, p2/M, z22.s, z0.s\n"
+ "smin z16.s, p2/M, z16.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ "smin z19.s, p2/M, z19.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "smin z30.s, p2/M, z30.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z27.s, p2/M, z27.s, z0.s\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
"uzp1 z31.h, z31.h, z12.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "uzp1 z12.h, z13.h, z14.h\n"
- "uzp1 z31.b, z31.b, z12.b\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "uzp1 z0.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z0.b\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
"st1b { z31.b }, p1, [x11]\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "uzp1 z31.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z31.b\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
"uzp1 z15.h, z15.h, z20.h\n"
- "st1b { z8.b }, p1, [x24]\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z15.b, z15.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "st1b { z15.b }, p1, [x23]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
"uzp1 z23.h, z23.h, z28.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z28.h, z29.h, z30.h\n"
- "uzp1 z23.b, z23.b, z28.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z23.b }, p1, [x21]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x20]\n"
+ "st1b { z23.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
"addvl x11, x11, #1\n"
"78:" // Height 6: Writeback done
"decw x10, ALL, MUL #4\n"
@@ -2420,7 +2420,6 @@ void sve_hybrid_s8qs_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2428,4 +2427,4 @@ void sve_hybrid_s8qs_mmla_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
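
The output stage reworked in the hunks above is arm_gemm's Requantize32 path: add the per-channel column bias, fixed-point multiply (sqrdmulh), optional sign correction plus rounding right shift (srshl), re-centre with c_offset, clamp to [minval, maxval], then narrow 32->16->8 bits (uzp1) and store (st1b). Below is a minimal scalar sketch of one output lane, assuming the usual gemmlowp-style fixed-point scheme; the function name and signature are illustrative only, not the library's API.

    #include <algorithm>
    #include <cstdint>

    int8_t requantize_lane(int32_t acc, int32_t col_bias, int32_t mul,
                           int right_shift, int32_t c_offset,
                           int32_t minval, int32_t maxval)
    {
        int32_t v = acc + col_bias;                  // the "add z.s, z.s, z<bias>.s" block
        // sqrdmulh: rounding doubling multiply returning the high half,
        // i.e. (2*v*mul + 2^31) >> 32; the INT32_MIN corner-case saturation is elided.
        int32_t scaled = (int32_t)(((int64_t)v * mul + (1LL << 30)) >> 31);
        // srshl by a negative amount acts as a rounding arithmetic shift right.
        // (The and/asr/sqadd block behind flag bit 5 applies an extra correction
        // for negative values before this shift; omitted from this sketch.)
        if (right_shift > 0)
            scaled = (int32_t)(((int64_t)scaled + (1LL << (right_shift - 1))) >> right_shift);
        scaled += c_offset;                          // re-centre on the int8 zero point
        scaled = std::min(std::max(scaled, minval), maxval);  // smin / smax clamp
        return (int8_t)scaled;                       // uzp1 32->16->8 narrowing + st1b
    }

Note that the register renumbering in the hunks above (the descending z7..z4 correction masks, the x25/x26 pointers becoming x20/x21) leaves this arithmetic untouched; only the choice of scratch registers changes.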
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index 28057aa961..cfa349f3aa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -39,6 +39,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+void sve_hybrid_s8s32_dot_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_s8s32_dot_6x4VL
{
@@ -74,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int32_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -83,10 +83,11 @@ public:
return { 20.92 };
case CPUModel::V1:
return { 62.24 };
+ case CPUModel::A64FX:
+ return { 94.32 };
}
}
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -95,6 +96,8 @@ public:
return { 22.77, 3.90, 0.47 };
case CPUModel::V1:
return { 48.09, 16.24, 0.83 };
+ case CPUModel::A64FX:
+ return { 100.19, 3.13, 0.43 };
}
}
@@ -103,13 +106,19 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
- cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_s8s32_dot_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
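
The substantive header change above is the constructor: it now inspects the CPU model and swaps the default kernel pointer for the new A64FX-tuned variant. A stripped-down sketch of that construction-time dispatch pattern follows, with simplified stand-in types rather than the arm_gemm definitions.

    enum class CpuModel { GENERIC, A64FX };

    using kern_type = void (*)(const void *args);   // simplified ARGLIST

    void kernel_generic(const void *) { /* generic SVE path */ }
    void kernel_a64fx(const void *)   { /* A64FX-tuned path */ }

    struct KernelDispatcher {
        kern_type kernel = kernel_generic;          // safe default
        explicit KernelDispatcher(CpuModel model) {
            switch (model) {
                case CpuModel::A64FX: kernel = kernel_a64fx; break;
                default:              break;        // keep the generic kernel
            }
        }
    };

Keeping the generic kernel as the in-class default means an unrecognised CPU model still gets a working implementation; the switch only ever upgrades the pointer.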
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
index 51e9aa1b40..1a483210f3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -115,11 +115,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -135,12 +135,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"8:" // Height 1: Multiply loop: Main loop
"sdot z8.s, z6.b, z0.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z11.s, z7.b, z0.b\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
@@ -150,12 +150,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"sdot z8.s, z6.b, z0.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z11.s, z7.b, z0.b\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
"addvl x10, x10, #4\n"
"bne 5b\n"
"st1w { z8.s }, p3, [x9]\n"
@@ -183,15 +183,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 13f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 14f\n"
"13:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -207,12 +207,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"15:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -220,7 +220,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 17f\n"
"16:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"17:" // Height 2: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -231,18 +231,18 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"18:" // Height 2: Multiply loop: Main loop
"sdot z8.s, z6.b, z0.b\n"
"sdot z12.s, z6.b, z1.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x26, x26, #0x4\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"subs x27, x27, #0x4\n"
"add x25, x25, #0x4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z14.s, z17.b, z1.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "sdot z15.s, z16.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
@@ -252,29 +252,29 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"sdot z8.s, z6.b, z0.b\n"
"sdot z12.s, z6.b, z1.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z14.s, z17.b, z1.b\n"
"addvl x10, x10, #4\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "sdot z15.s, z16.b, z1.b\n"
"bne 15b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x20]\n"
+ "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
"20:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -295,20 +295,20 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 24f\n"
"23:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -328,13 +328,13 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"25:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 26f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 27f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -343,8 +343,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 27f\n"
"26:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"27:" // Height 3: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -360,21 +360,21 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"subs x27, x27, #0x4\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x24, x24, #0x4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z11.s, z7.b, z0.b\n"
+ "sdot z10.s, z21.b, z0.b\n"
+ "sdot z14.s, z21.b, z1.b\n"
+ "sdot z18.s, z21.b, z2.b\n"
+ "sdot z11.s, z20.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "sdot z15.s, z7.b, z1.b\n"
- "sdot z19.s, z7.b, z2.b\n"
+ "sdot z15.s, z20.b, z1.b\n"
+ "sdot z19.s, z20.b, z2.b\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -386,35 +386,35 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
- "sdot z19.s, z7.b, z2.b\n"
+ "sdot z10.s, z21.b, z0.b\n"
+ "sdot z14.s, z21.b, z1.b\n"
+ "sdot z18.s, z21.b, z2.b\n"
+ "sdot z11.s, z20.b, z0.b\n"
+ "sdot z15.s, z20.b, z1.b\n"
+ "sdot z19.s, z20.b, z2.b\n"
"bne 25b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x21]\n"
+ "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20]\n"
+ "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
"30:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -435,25 +435,25 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x22]\n"
- "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 34f\n"
"33:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -477,14 +477,14 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"35:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -494,9 +494,9 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 37f\n"
"36:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"37:" // Height 4: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -513,7 +513,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"subs x27, x27, #0x4\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z20.s, z6.b, z3.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
@@ -521,19 +521,19 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x23, x23, #0x4\n"
"sdot z17.s, z7.b, z2.b\n"
"sdot z21.s, z7.b, z3.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z22.s, z6.b, z3.b\n"
+ "sdot z10.s, z25.b, z0.b\n"
+ "sdot z14.s, z25.b, z1.b\n"
+ "sdot z18.s, z25.b, z2.b\n"
+ "sdot z22.s, z25.b, z3.b\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
+ "sdot z11.s, z24.b, z0.b\n"
+ "sdot z15.s, z24.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
- "sdot z19.s, z7.b, z2.b\n"
- "sdot z23.s, z7.b, z3.b\n"
+ "sdot z19.s, z24.b, z2.b\n"
+ "sdot z23.s, z24.b, z3.b\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -545,44 +545,44 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"sdot z16.s, z6.b, z2.b\n"
"sdot z20.s, z6.b, z3.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"sdot z9.s, z7.b, z0.b\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
"sdot z21.s, z7.b, z3.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z22.s, z6.b, z3.b\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
- "sdot z19.s, z7.b, z2.b\n"
- "sdot z23.s, z7.b, z3.b\n"
+ "sdot z10.s, z25.b, z0.b\n"
+ "sdot z14.s, z25.b, z1.b\n"
+ "sdot z18.s, z25.b, z2.b\n"
+ "sdot z22.s, z25.b, z3.b\n"
+ "sdot z11.s, z24.b, z0.b\n"
+ "sdot z15.s, z24.b, z1.b\n"
+ "sdot z19.s, z24.b, z2.b\n"
+ "sdot z23.s, z24.b, z3.b\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x22]\n"
- "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x22]\n"
+ "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x21]\n"
+ "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
"40:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -603,30 +603,30 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x22]\n"
- "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x21]\n"
- "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -654,15 +654,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"45:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -673,10 +673,10 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 47f\n"
"46:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"47:" // Height 5: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -698,29 +698,29 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"add x24, x24, #0x4\n"
"sdot z24.s, z6.b, z4.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
"add x22, x22, #0x4\n"
"sdot z21.s, z7.b, z3.b\n"
"sdot z25.s, z7.b, z4.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z22.s, z6.b, z3.b\n"
- "sdot z26.s, z6.b, z4.b\n"
- "sdot z11.s, z7.b, z0.b\n"
+ "sdot z10.s, z29.b, z0.b\n"
+ "sdot z14.s, z29.b, z1.b\n"
+ "sdot z18.s, z29.b, z2.b\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ "sdot z26.s, z29.b, z4.b\n"
+ "sdot z11.s, z28.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "sdot z15.s, z7.b, z1.b\n"
- "sdot z19.s, z7.b, z2.b\n"
+ "sdot z15.s, z28.b, z1.b\n"
+ "sdot z19.s, z28.b, z2.b\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "sdot z23.s, z7.b, z3.b\n"
- "sdot z27.s, z7.b, z4.b\n"
+ "sdot z23.s, z28.b, z3.b\n"
+ "sdot z27.s, z28.b, z4.b\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -735,50 +735,50 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"cmp x28, x20\n"
"sdot z24.s, z6.b, z4.b\n"
"sdot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b\n"
"sdot z17.s, z7.b, z2.b\n"
"sdot z21.s, z7.b, z3.b\n"
"sdot z25.s, z7.b, z4.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b\n"
- "sdot z14.s, z6.b, z1.b\n"
- "sdot z18.s, z6.b, z2.b\n"
- "sdot z22.s, z6.b, z3.b\n"
- "sdot z26.s, z6.b, z4.b\n"
- "sdot z11.s, z7.b, z0.b\n"
- "sdot z15.s, z7.b, z1.b\n"
- "sdot z19.s, z7.b, z2.b\n"
- "sdot z23.s, z7.b, z3.b\n"
- "sdot z27.s, z7.b, z4.b\n"
+ "sdot z10.s, z29.b, z0.b\n"
+ "sdot z14.s, z29.b, z1.b\n"
+ "sdot z18.s, z29.b, z2.b\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ "sdot z26.s, z29.b, z4.b\n"
+ "sdot z11.s, z28.b, z0.b\n"
+ "sdot z15.s, z28.b, z1.b\n"
+ "sdot z19.s, z28.b, z2.b\n"
+ "sdot z23.s, z28.b, z3.b\n"
+ "sdot z27.s, z28.b, z4.b\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "st1w { z8.s }, p3, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x22]\n"
- "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x21]\n"
- "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
"50:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -862,16 +862,16 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"55:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -883,11 +883,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"b 57f\n"
"56:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"57:" // Height 6: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1022,7 +1022,6 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"62:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1030,4 +1029,4 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
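
Functionally, every hunk in this a64fx.cpp diff is a register rename: the weight vectors previously loaded into z6/z7 move to higher registers (z16/z17, z20/z21, z24/z25, ...), presumably so that the next loads of z6/z7 are not held back behind the previous values' sdot consumers on an in-order core. The arithmetic itself is unchanged: a four-way signed dot product with accumulate. A scalar model of one 32-bit accumulator lane:

    #include <cstdint>

    int32_t sdot_lane(int32_t acc, const int8_t w[4], const int8_t x[4])
    {
        for (int i = 0; i < 4; ++i)
            acc += (int32_t)w[i] * (int32_t)x[i];   // widening multiply-accumulate
        return acc;
    }

Each SVE sdot instruction performs this for every 32-bit lane of the vector, which is why the accumulators (z8-z27) are written as .s while both operands are read as .b.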
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index b3d2e6b271..eeef192b66 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -115,11 +115,11 @@ void sve_hybrid_s8s32_dot_6x4VL (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -132,87 +132,87 @@ void sve_hybrid_s8s32_dot_6x4VL (
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
"add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
"addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -244,15 +244,15 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 15f\n"
"14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -268,12 +268,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -281,146 +281,146 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 18f\n"
"17:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[0]\n"
+ "sdot z12.s, z17.b, z0.b[0]\n"
+ "sdot z9.s, z16.b, z1.b[0]\n"
+ "sdot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[0]\n"
+ "sdot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"cmp x27, #0x10\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[0]\n"
+ "sdot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[1]\n"
+ "sdot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[1]\n"
+ "sdot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z10.s, z17.b, z1.b[1]\n"
+ "sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[1]\n"
+ "sdot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[2]\n"
+ "sdot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[2]\n"
+ "sdot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[2]\n"
+ "sdot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[2]\n"
+ "sdot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[3]\n"
+ "sdot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[3]\n"
+ "sdot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[3]\n"
+ "sdot z14.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z1.b[3]\n"
+ "sdot z15.s, z16.b, z0.b[3]\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[0]\n"
+ "sdot z12.s, z17.b, z1.b[0]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "sdot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z14.s, z17.b, z1.b[0]\n"
"addvl x10, x10, #4\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "sdot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z12.s, z17.b, z1.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "sdot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z14.s, z17.b, z1.b[1]\n"
"addvl x10, x10, #4\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "sdot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z12.s, z17.b, z1.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "sdot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z14.s, z17.b, z1.b[2]\n"
"addvl x10, x10, #4\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "sdot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z12.s, z17.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "sdot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z14.s, z17.b, z1.b[3]\n"
"addvl x10, x10, #4\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "sdot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x20]\n"
+ "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -441,20 +441,20 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 26f\n"
"25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -474,13 +474,13 @@ void sve_hybrid_s8s32_dot_6x4VL (
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 29f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -489,86 +489,86 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 29f\n"
"28:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z21.b, z2.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[0]\n"
+ "sdot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"cmp x27, #0x10\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z10.s, z21.b, z2.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z0.b[1]\n"
+ "sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z10.s, z21.b, z2.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z0.b[2]\n"
+ "sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z11.s, z20.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -576,100 +576,100 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z21.b, z0.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z2.b[0]\n"
+ "sdot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z10.s, z21.b, z0.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "sdot z18.s, z21.b, z2.b[0]\n"
+ "sdot z11.s, z20.b, z0.b[0]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z2.b[0]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z2.b[1]\n"
+ "sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z10.s, z21.b, z0.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z2.b[1]\n"
+ "sdot z11.s, z20.b, z0.b[1]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z2.b[1]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z2.b[2]\n"
+ "sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z10.s, z21.b, z0.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z2.b[2]\n"
+ "sdot z11.s, z20.b, z0.b[2]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z2.b[2]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z2.b[3]\n"
+ "sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z10.s, z21.b, z0.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z2.b[3]\n"
+ "sdot z11.s, z20.b, z0.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x21]\n"
+ "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -690,25 +690,25 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 37f\n"
"36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -732,14 +732,14 @@ void sve_hybrid_s8s32_dot_6x4VL (
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -749,105 +749,105 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 40f\n"
"39:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[0]\n"
+ "sdot z12.s, z25.b, z2.b[0]\n"
+ "sdot z16.s, z25.b, z1.b[0]\n"
+ "sdot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "sdot z17.s, z24.b, z1.b[0]\n"
+ "sdot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[0]\n"
+ "sdot z14.s, z25.b, z2.b[0]\n"
+ "sdot z18.s, z25.b, z1.b[0]\n"
+ "sdot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[0]\n"
+ "sdot z15.s, z24.b, z2.b[0]\n"
+ "sdot z19.s, z24.b, z1.b[0]\n"
+ "sdot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[1]\n"
+ "sdot z12.s, z25.b, z2.b[1]\n"
+ "sdot z16.s, z25.b, z1.b[1]\n"
+ "sdot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[1]\n"
+ "sdot z13.s, z24.b, z2.b[1]\n"
+ "sdot z17.s, z24.b, z1.b[1]\n"
+ "sdot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z10.s, z25.b, z3.b[1]\n"
+ "sdot z14.s, z25.b, z2.b[1]\n"
+ "sdot z18.s, z25.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[1]\n"
+ "sdot z15.s, z24.b, z2.b[1]\n"
+ "sdot z19.s, z24.b, z1.b[1]\n"
+ "sdot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[2]\n"
+ "sdot z12.s, z25.b, z2.b[2]\n"
+ "sdot z16.s, z25.b, z1.b[2]\n"
+ "sdot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[2]\n"
+ "sdot z13.s, z24.b, z2.b[2]\n"
+ "sdot z17.s, z24.b, z1.b[2]\n"
+ "sdot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[2]\n"
+ "sdot z14.s, z25.b, z2.b[2]\n"
+ "sdot z18.s, z25.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[2]\n"
+ "sdot z15.s, z24.b, z2.b[2]\n"
+ "sdot z19.s, z24.b, z1.b[2]\n"
+ "sdot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[3]\n"
+ "sdot z12.s, z25.b, z2.b[3]\n"
+ "sdot z16.s, z25.b, z1.b[3]\n"
+ "sdot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[3]\n"
+ "sdot z13.s, z24.b, z2.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[3]\n"
+ "sdot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[3]\n"
+ "sdot z14.s, z25.b, z2.b[3]\n"
+ "sdot z18.s, z25.b, z1.b[3]\n"
+ "sdot z22.s, z25.b, z0.b[3]\n"
+ "sdot z11.s, z24.b, z3.b[3]\n"
+ "sdot z15.s, z24.b, z2.b[3]\n"
+ "sdot z19.s, z24.b, z1.b[3]\n"
+ "sdot z23.s, z24.b, z0.b[3]\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -856,121 +856,121 @@ void sve_hybrid_s8s32_dot_6x4VL (
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[0]\n"
+ "sdot z12.s, z25.b, z1.b[0]\n"
+ "sdot z16.s, z25.b, z2.b[0]\n"
+ "sdot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
+ "sdot z17.s, z24.b, z2.b[0]\n"
+ "sdot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z10.s, z25.b, z0.b[0]\n"
+ "sdot z14.s, z25.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z2.b[0]\n"
+ "sdot z22.s, z25.b, z3.b[0]\n"
+ "sdot z11.s, z24.b, z0.b[0]\n"
+ "sdot z15.s, z24.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z2.b[0]\n"
+ "sdot z23.s, z24.b, z3.b[0]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[1]\n"
+ "sdot z12.s, z25.b, z1.b[1]\n"
+ "sdot z16.s, z25.b, z2.b[1]\n"
+ "sdot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[1]\n"
+ "sdot z13.s, z24.b, z1.b[1]\n"
+ "sdot z17.s, z24.b, z2.b[1]\n"
+ "sdot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z10.s, z25.b, z0.b[1]\n"
+ "sdot z14.s, z25.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z2.b[1]\n"
+ "sdot z22.s, z25.b, z3.b[1]\n"
+ "sdot z11.s, z24.b, z0.b[1]\n"
+ "sdot z15.s, z24.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z2.b[1]\n"
+ "sdot z23.s, z24.b, z3.b[1]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[2]\n"
+ "sdot z12.s, z25.b, z1.b[2]\n"
+ "sdot z16.s, z25.b, z2.b[2]\n"
+ "sdot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[2]\n"
+ "sdot z13.s, z24.b, z1.b[2]\n"
+ "sdot z17.s, z24.b, z2.b[2]\n"
+ "sdot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z10.s, z25.b, z0.b[2]\n"
+ "sdot z14.s, z25.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z2.b[2]\n"
+ "sdot z22.s, z25.b, z3.b[2]\n"
+ "sdot z11.s, z24.b, z0.b[2]\n"
+ "sdot z15.s, z24.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z2.b[2]\n"
+ "sdot z23.s, z24.b, z3.b[2]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[3]\n"
+ "sdot z12.s, z25.b, z1.b[3]\n"
+ "sdot z16.s, z25.b, z2.b[3]\n"
+ "sdot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[3]\n"
+ "sdot z13.s, z24.b, z1.b[3]\n"
+ "sdot z17.s, z24.b, z2.b[3]\n"
+ "sdot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z10.s, z25.b, z0.b[3]\n"
+ "sdot z14.s, z25.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z2.b[3]\n"
+ "sdot z22.s, z25.b, z3.b[3]\n"
+ "sdot z11.s, z24.b, z0.b[3]\n"
+ "sdot z15.s, z24.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z2.b[3]\n"
+ "sdot z23.s, z24.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x22]\n"
+ "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x20]\n"
+ "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -991,30 +991,30 @@ void sve_hybrid_s8s32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x21]\n"
- "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 48f\n"
"47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -1042,15 +1042,15 @@ void sve_hybrid_s8s32_dot_6x4VL (
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1061,124 +1061,124 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 51f\n"
"50:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z29.b, z4.b[0]\n"
+ "sdot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z24.s, z29.b, z0.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z13.s, z28.b, z3.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "sdot z21.s, z28.b, z1.b[0]\n"
+ "sdot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[0]\n"
+ "sdot z14.s, z29.b, z3.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z0.b[0]\n"
+ "sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[1]\n"
+ "sdot z12.s, z29.b, z3.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z1.b[1]\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z1.b[1]\n"
+ "sdot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z10.s, z29.b, z4.b[1]\n"
+ "sdot z14.s, z29.b, z3.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z0.b[1]\n"
+ "sdot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[2]\n"
+ "sdot z12.s, z29.b, z3.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z1.b[2]\n"
+ "sdot z24.s, z29.b, z0.b[2]\n"
+ "sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "sdot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[2]\n"
+ "sdot z14.s, z29.b, z3.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z0.b[2]\n"
+ "sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[3]\n"
+ "sdot z12.s, z29.b, z3.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z1.b[3]\n"
+ "sdot z24.s, z29.b, z0.b[3]\n"
+ "sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[3]\n"
+ "sdot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[3]\n"
+ "sdot z14.s, z29.b, z3.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z0.b[3]\n"
+ "sdot z11.s, z28.b, z4.b[3]\n"
+ "sdot z15.s, z28.b, z3.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z0.b[3]\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1188,142 +1188,142 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z29.b, z0.b[0]\n"
+ "sdot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z3.b[0]\n"
+ "sdot z24.s, z29.b, z4.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
+ "sdot z21.s, z28.b, z3.b[0]\n"
+ "sdot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z10.s, z29.b, z0.b[0]\n"
+ "sdot z14.s, z29.b, z1.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z3.b[0]\n"
+ "sdot z26.s, z29.b, z4.b[0]\n"
+ "sdot z11.s, z28.b, z0.b[0]\n"
+ "sdot z15.s, z28.b, z1.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z3.b[0]\n"
+ "sdot z27.s, z28.b, z4.b[0]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[1]\n"
+ "sdot z12.s, z29.b, z1.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z4.b[1]\n"
+ "sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z3.b[1]\n"
+ "sdot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z10.s, z29.b, z0.b[1]\n"
+ "sdot z14.s, z29.b, z1.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z3.b[1]\n"
+ "sdot z26.s, z29.b, z4.b[1]\n"
+ "sdot z11.s, z28.b, z0.b[1]\n"
+ "sdot z15.s, z28.b, z1.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z3.b[1]\n"
+ "sdot z27.s, z28.b, z4.b[1]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[2]\n"
+ "sdot z12.s, z29.b, z1.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z4.b[2]\n"
+ "sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z3.b[2]\n"
+ "sdot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z10.s, z29.b, z0.b[2]\n"
+ "sdot z14.s, z29.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z3.b[2]\n"
+ "sdot z26.s, z29.b, z4.b[2]\n"
+ "sdot z11.s, z28.b, z0.b[2]\n"
+ "sdot z15.s, z28.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z3.b[2]\n"
+ "sdot z27.s, z28.b, z4.b[2]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[3]\n"
+ "sdot z12.s, z29.b, z1.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z3.b[3]\n"
+ "sdot z24.s, z29.b, z4.b[3]\n"
+ "sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z3.b[3]\n"
+ "sdot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z10.s, z29.b, z0.b[3]\n"
+ "sdot z14.s, z29.b, z1.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z3.b[3]\n"
+ "sdot z26.s, z29.b, z4.b[3]\n"
+ "sdot z11.s, z28.b, z0.b[3]\n"
+ "sdot z15.s, z28.b, z1.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z3.b[3]\n"
+ "sdot z27.s, z28.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "st1w { z8.s }, p4, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1407,16 +1407,16 @@ void sve_hybrid_s8s32_dot_6x4VL (
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1428,143 +1428,143 @@ void sve_hybrid_s8s32_dot_6x4VL (
"b 62f\n"
"61:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[0]\n"
+ "sdot z12.s, z1.b, z6.b[0]\n"
+ "sdot z16.s, z1.b, z5.b[0]\n"
+ "sdot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z24.s, z1.b, z3.b[0]\n"
+ "sdot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[0]\n"
+ "sdot z13.s, z0.b, z6.b[0]\n"
+ "sdot z17.s, z0.b, z5.b[0]\n"
+ "sdot z21.s, z0.b, z4.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[0]\n"
+ "sdot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[0]\n"
+ "sdot z14.s, z1.b, z6.b[0]\n"
+ "sdot z18.s, z1.b, z5.b[0]\n"
+ "sdot z22.s, z1.b, z4.b[0]\n"
+ "sdot z26.s, z1.b, z3.b[0]\n"
+ "sdot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[0]\n"
+ "sdot z15.s, z0.b, z6.b[0]\n"
+ "sdot z19.s, z0.b, z5.b[0]\n"
+ "sdot z23.s, z0.b, z4.b[0]\n"
+ "sdot z27.s, z0.b, z3.b[0]\n"
+ "sdot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[1]\n"
+ "sdot z12.s, z1.b, z6.b[1]\n"
+ "sdot z16.s, z1.b, z5.b[1]\n"
+ "sdot z20.s, z1.b, z4.b[1]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[1]\n"
+ "sdot z13.s, z0.b, z6.b[1]\n"
+ "sdot z17.s, z0.b, z5.b[1]\n"
+ "sdot z21.s, z0.b, z4.b[1]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z10.s, z1.b, z7.b[1]\n"
+ "sdot z14.s, z1.b, z6.b[1]\n"
+ "sdot z18.s, z1.b, z5.b[1]\n"
+ "sdot z22.s, z1.b, z4.b[1]\n"
+ "sdot z26.s, z1.b, z3.b[1]\n"
+ "sdot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[1]\n"
+ "sdot z15.s, z0.b, z6.b[1]\n"
+ "sdot z19.s, z0.b, z5.b[1]\n"
+ "sdot z23.s, z0.b, z4.b[1]\n"
+ "sdot z27.s, z0.b, z3.b[1]\n"
+ "sdot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[2]\n"
+ "sdot z12.s, z1.b, z6.b[2]\n"
+ "sdot z16.s, z1.b, z5.b[2]\n"
+ "sdot z20.s, z1.b, z4.b[2]\n"
+ "sdot z24.s, z1.b, z3.b[2]\n"
+ "sdot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[2]\n"
+ "sdot z13.s, z0.b, z6.b[2]\n"
+ "sdot z17.s, z0.b, z5.b[2]\n"
+ "sdot z21.s, z0.b, z4.b[2]\n"
+ "sdot z25.s, z0.b, z3.b[2]\n"
+ "sdot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[2]\n"
+ "sdot z14.s, z1.b, z6.b[2]\n"
+ "sdot z18.s, z1.b, z5.b[2]\n"
+ "sdot z22.s, z1.b, z4.b[2]\n"
+ "sdot z26.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[2]\n"
+ "sdot z15.s, z0.b, z6.b[2]\n"
+ "sdot z19.s, z0.b, z5.b[2]\n"
+ "sdot z23.s, z0.b, z4.b[2]\n"
+ "sdot z27.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[3]\n"
+ "sdot z12.s, z1.b, z6.b[3]\n"
+ "sdot z16.s, z1.b, z5.b[3]\n"
+ "sdot z20.s, z1.b, z4.b[3]\n"
+ "sdot z24.s, z1.b, z3.b[3]\n"
+ "sdot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[3]\n"
+ "sdot z13.s, z0.b, z6.b[3]\n"
+ "sdot z17.s, z0.b, z5.b[3]\n"
+ "sdot z21.s, z0.b, z4.b[3]\n"
+ "sdot z25.s, z0.b, z3.b[3]\n"
+ "sdot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[3]\n"
+ "sdot z14.s, z1.b, z6.b[3]\n"
+ "sdot z18.s, z1.b, z5.b[3]\n"
+ "sdot z22.s, z1.b, z4.b[3]\n"
+ "sdot z26.s, z1.b, z3.b[3]\n"
+ "sdot z30.s, z1.b, z2.b[3]\n"
+ "sdot z11.s, z0.b, z7.b[3]\n"
+ "sdot z15.s, z0.b, z6.b[3]\n"
+ "sdot z19.s, z0.b, z5.b[3]\n"
+ "sdot z23.s, z0.b, z4.b[3]\n"
+ "sdot z27.s, z0.b, z3.b[3]\n"
+ "sdot z31.s, z0.b, z2.b[3]\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1575,127 +1575,127 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[0]\n"
+ "sdot z12.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z7.b, z3.b[0]\n"
+ "sdot z24.s, z7.b, z4.b[0]\n"
+ "sdot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[0]\n"
+ "sdot z21.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z4.b[0]\n"
+ "sdot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
+ "sdot z10.s, z7.b, z0.b[0]\n"
+ "sdot z14.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z7.b, z2.b[0]\n"
+ "sdot z22.s, z7.b, z3.b[0]\n"
+ "sdot z26.s, z7.b, z4.b[0]\n"
+ "sdot z30.s, z7.b, z5.b[0]\n"
+ "sdot z11.s, z6.b, z0.b[0]\n"
+ "sdot z15.s, z6.b, z1.b[0]\n"
+ "sdot z19.s, z6.b, z2.b[0]\n"
+ "sdot z23.s, z6.b, z3.b[0]\n"
+ "sdot z27.s, z6.b, z4.b[0]\n"
+ "sdot z31.s, z6.b, z5.b[0]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[1]\n"
+ "sdot z12.s, z7.b, z1.b[1]\n"
+ "sdot z16.s, z7.b, z2.b[1]\n"
+ "sdot z20.s, z7.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z4.b[1]\n"
+ "sdot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[1]\n"
+ "sdot z13.s, z6.b, z1.b[1]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z21.s, z6.b, z3.b[1]\n"
+ "sdot z25.s, z6.b, z4.b[1]\n"
+ "sdot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
+ "sdot z10.s, z7.b, z0.b[1]\n"
+ "sdot z14.s, z7.b, z1.b[1]\n"
+ "sdot z18.s, z7.b, z2.b[1]\n"
+ "sdot z22.s, z7.b, z3.b[1]\n"
+ "sdot z26.s, z7.b, z4.b[1]\n"
+ "sdot z30.s, z7.b, z5.b[1]\n"
+ "sdot z11.s, z6.b, z0.b[1]\n"
+ "sdot z15.s, z6.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z2.b[1]\n"
+ "sdot z23.s, z6.b, z3.b[1]\n"
+ "sdot z27.s, z6.b, z4.b[1]\n"
+ "sdot z31.s, z6.b, z5.b[1]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[2]\n"
+ "sdot z12.s, z7.b, z1.b[2]\n"
+ "sdot z16.s, z7.b, z2.b[2]\n"
+ "sdot z20.s, z7.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z4.b[2]\n"
+ "sdot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[2]\n"
+ "sdot z13.s, z6.b, z1.b[2]\n"
+ "sdot z17.s, z6.b, z2.b[2]\n"
+ "sdot z21.s, z6.b, z3.b[2]\n"
+ "sdot z25.s, z6.b, z4.b[2]\n"
+ "sdot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
+ "sdot z10.s, z7.b, z0.b[2]\n"
+ "sdot z14.s, z7.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z2.b[2]\n"
+ "sdot z22.s, z7.b, z3.b[2]\n"
+ "sdot z26.s, z7.b, z4.b[2]\n"
+ "sdot z30.s, z7.b, z5.b[2]\n"
+ "sdot z11.s, z6.b, z0.b[2]\n"
+ "sdot z15.s, z6.b, z1.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[2]\n"
+ "sdot z23.s, z6.b, z3.b[2]\n"
+ "sdot z27.s, z6.b, z4.b[2]\n"
+ "sdot z31.s, z6.b, z5.b[2]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[3]\n"
+ "sdot z12.s, z7.b, z1.b[3]\n"
+ "sdot z16.s, z7.b, z2.b[3]\n"
+ "sdot z20.s, z7.b, z3.b[3]\n"
+ "sdot z24.s, z7.b, z4.b[3]\n"
+ "sdot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[3]\n"
+ "sdot z13.s, z6.b, z1.b[3]\n"
+ "sdot z17.s, z6.b, z2.b[3]\n"
+ "sdot z21.s, z6.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z4.b[3]\n"
+ "sdot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z10.s, z7.b, z0.b[3]\n"
+ "sdot z14.s, z7.b, z1.b[3]\n"
+ "sdot z18.s, z7.b, z2.b[3]\n"
+ "sdot z22.s, z7.b, z3.b[3]\n"
+ "sdot z26.s, z7.b, z4.b[3]\n"
+ "sdot z30.s, z7.b, z5.b[3]\n"
+ "sdot z11.s, z6.b, z0.b[3]\n"
+ "sdot z15.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z23.s, z6.b, z3.b[3]\n"
+ "sdot z27.s, z6.b, z4.b[3]\n"
+ "sdot z31.s, z6.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1748,7 +1748,6 @@ void sve_hybrid_s8s32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1756,4 +1755,4 @@ void sve_hybrid_s8s32_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
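The dot-product hunks above only renumber the temporary operand registers; the accumulator targets (z8-z31) and the arithmetic are unchanged. That arithmetic is the SVE SDOT (indexed) instruction: each 32-bit accumulator lane gains the sum of four signed 8-bit products, with the multiplicand byte quadruple taken from the selected 32-bit element of the corresponding 128-bit segment of the second operand. A minimal scalar sketch of that semantic follows; the function and parameter names are illustrative, not part of the library.

#include <cstddef>
#include <cstdint>

// Scalar model of "sdot zda.s, zn.b, zm.b[idx]" over one vector of vl
// bytes (vl a multiple of 16). Within every 128-bit segment, the byte
// quad at 32-bit element `idx` of zm is reused by all four 32-bit
// lanes of that segment. Names are illustrative only.
static void sdot_indexed(int32_t *zda, const int8_t *zn,
                         const int8_t *zm, int idx, size_t vl)
{
    for (size_t seg = 0; seg < vl; seg += 16)        // 128-bit segments
    {
        const int8_t *quad = &zm[seg + 4 * idx];     // selected byte quad
        for (size_t lane = 0; lane < 4; lane++)      // 32-bit lanes
        {
            int32_t sum = 0;
            for (size_t k = 0; k < 4; k++)
                sum += int32_t(zn[seg + 4 * lane + k]) * int32_t(quad[k]);
            zda[seg / 4 + lane] += sum;
        }
    }
}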
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index c08977570e..686295496e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int32_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -86,7 +85,6 @@ public:
}
}
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -111,5 +109,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
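The mmla variant diffed below uses SMMLA rather than SDOT: each 128-bit segment of the destination is a 2x2 int32 accumulator that gains the product of a row-major 2x8 signed 8-bit matrix from the first source and the transpose of a row-major 2x8 matrix from the second source. This is why the kernel interleaves pairs of input rows with trn1/trn2 before the multiplies, and why the accumulator rows are converted with zip1/zip2 at load time and uzp1/uzp2 at writeback. A minimal scalar sketch, again with illustrative names:

#include <cstddef>
#include <cstdint>

// Scalar model of "smmla zda.s, zn.b, zm.b" over one vector of vl
// bytes (vl a multiple of 16). Per 128-bit segment:
//   C(2x2, int32) += A(2x8, int8) * B(2x8, int8)^T
// with A, B, and C all row-major within the segment.
static void smmla(int32_t *zda, const int8_t *zn, const int8_t *zm,
                  size_t vl)
{
    for (size_t seg = 0; seg < vl; seg += 16)        // 128-bit segments
    {
        for (size_t r = 0; r < 2; r++)               // row of A
        {
            for (size_t c = 0; c < 2; c++)           // row of B
            {
                int32_t sum = 0;
                for (size_t k = 0; k < 8; k++)
                    sum += int32_t(zn[seg + 8 * r + k]) *
                           int32_t(zm[seg + 8 * c + k]);
                zda[seg / 4 + 2 * r + c] += sum;     // row-major 2x2
            }
        }
    }
}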
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
index 350425647a..f66b6345ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
@@ -100,16 +100,16 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"incw x20\n"
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 4f\n"
@@ -127,11 +127,11 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -143,86 +143,86 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45109a8a // smmla z10.s, z20.b, z16.b\n"
+ ".inst 0x45079a8e // smmla z14.s, z20.b, z7.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
"addvl x10, x10, #8\n"
"ble 10f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
"addvl x10, x10, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -258,21 +258,21 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z18.s }, p4/Z, [x9]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 15f\n"
@@ -290,12 +290,12 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -303,95 +303,95 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 18f\n"
"17:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
"addvl x10, x10, #8\n"
"ble 21f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
"addvl x10, x10, #8\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -399,24 +399,24 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
- "uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x24]\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x20]\n"
+ "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -437,28 +437,28 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -490,13 +490,13 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 29f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -505,169 +505,169 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 29f\n"
"28:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 32f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
"32:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
"uzp1 z16.d, z16.d, z20.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z8.s }, p4, [x21]\n"
"uzp1 z19.d, z19.d, z23.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -688,37 +688,37 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -746,14 +746,14 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -763,182 +763,182 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 40f\n"
"39:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
"ble 43f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
"43:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "uzp1 z15.d, z16.d, z20.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x23]\n"
- "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z25.s }, p4, [x21]\n"
+ "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -959,54 +959,54 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 48f\n"
"47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -1038,15 +1038,15 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1057,231 +1057,231 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 51f\n"
"50:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
+ ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
+ ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
+ ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
"addvl x10, x10, #8\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 54f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
"54:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "uzp1 z15.d, z16.d, z20.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z8.s }, p4, [x23]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
"uzp1 z24.d, z24.d, z28.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
"uzp1 z27.d, z27.d, z31.d\n"
- "st1w { z15.s }, p4, [x23]\n"
- "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z1.s }, p4, [x22]\n"
+ "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1307,26 +1307,26 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
+ "zip1 z8.d, z17.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip2 z14.d, z20.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
@@ -1344,7 +1344,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
@@ -1356,8 +1356,8 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 59f\n"
"58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
@@ -1389,16 +1389,16 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1410,184 +1410,184 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"b 62f\n"
"61:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
"add x21, x21, #0x10\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
- ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
- ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
- ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
- ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
- ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
"addvl x10, x10, #8\n"
- ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
- ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
- ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
- ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
- ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"ble 65f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
- ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
- ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
- ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
- ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
- ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
- ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
- ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
- ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
- ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
- ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
- ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
- ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
- ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
- ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
"65:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1596,7 +1596,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z0.d, z8.d, z12.d\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
@@ -1604,7 +1604,7 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"add x20, x21, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z0.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z14.d, z11.d, z15.d\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1664,7 +1664,6 @@ void sve_hybrid_s8s32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1672,4 +1671,4 @@ void sve_hybrid_s8s32_mmla_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index c66ebedc4d..11fe5ce7e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 79bd563a4b..e74b424888 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -104,11 +104,11 @@ void sve_hybrid_u8qa_dot_4x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -121,39 +121,39 @@ void sve_hybrid_u8qa_dot_4x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "udot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[0]\n"
+ "udot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "udot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[1]\n"
+ "udot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[2]\n"
+ "udot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
"add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -164,47 +164,47 @@ void sve_hybrid_u8qa_dot_4x4VL (
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z0.b }, p0/Z, [x24]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
"subs x25, x25, #0x4\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[0]\n"
+ "udot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[0]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z22.b, z0.b[1]\n"
+ "udot z18.s, z21.b, z0.b[1]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z16.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z22.b, z0.b[2]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
"ble 10f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
"addvl x28, x28, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
@@ -218,71 +218,71 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "neg z20.s, p2/M, z20.s\n"
+ "mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [x10]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
"addvl x10, x10, #4\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z0.d\n"
+ "and z21.d, z18.d, z0.d\n"
+ "and z20.d, z19.d, z0.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z19.s, z19.s, z4.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z20.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z21.s\n"
+ "smin z17.s, p2/M, z17.s, z21.s\n"
+ "smin z18.s, p2/M, z18.s, z21.s\n"
+ "smin z19.s, p2/M, z19.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z20.s\n"
+ "smax z17.s, p2/M, z17.s, z20.s\n"
+ "smax z18.s, p2/M, z18.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
"st1b { z16.b }, p1, [x27]\n"
@@ -317,12 +317,12 @@ void sve_hybrid_u8qa_dot_4x4VL (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -330,7 +330,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
"cmp x25, #0x10\n"
"ble 23f\n"
@@ -339,56 +339,56 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[0]\n"
+ "udot z21.s, z26.b, z1.b[0]\n"
+ "udot z18.s, z24.b, z0.b[0]\n"
+ "udot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z19.s, z25.b, z0.b[0]\n"
+ "udot z23.s, z25.b, z1.b[0]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[1]\n"
+ "udot z20.s, z24.b, z1.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "udot z17.s, z27.b, z0.b[1]\n"
+ "udot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z18.s, z26.b, z0.b[1]\n"
+ "udot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z19.s, z25.b, z0.b[1]\n"
+ "udot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[2]\n"
+ "udot z20.s, z24.b, z1.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
"add x23, x23, #0x10\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z17.s, z30.b, z0.b[2]\n"
+ "udot z21.s, z30.b, z1.b[2]\n"
+ "udot z18.s, z29.b, z0.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z19.s, z28.b, z0.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z16.s, z27.b, z0.b[3]\n"
+ "udot z20.s, z27.b, z1.b[3]\n"
+ "udot z17.s, z26.b, z0.b[3]\n"
+ "udot z21.s, z26.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z0.b[3]\n"
+ "udot z22.s, z25.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z0.b[3]\n"
+ "udot z23.s, z24.b, z1.b[3]\n"
"tbnz %x[flags], #31, 22f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
@@ -401,63 +401,63 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x24]\n"
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[0]\n"
+ "udot z21.s, z26.b, z1.b[0]\n"
+ "udot z18.s, z25.b, z0.b[0]\n"
+ "udot z22.s, z25.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z24.b, z0.b[0]\n"
+ "udot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z16.s, z27.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z27.b, z1.b[1]\n"
+ "udot z17.s, z26.b, z0.b[1]\n"
+ "udot z21.s, z26.b, z1.b[1]\n"
+ "udot z18.s, z25.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z25.b, z1.b[1]\n"
+ "udot z19.s, z24.b, z0.b[1]\n"
+ "udot z23.s, z24.b, z1.b[1]\n"
"ble 24f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z16.s, z27.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z27.b, z1.b[2]\n"
+ "udot z17.s, z26.b, z0.b[2]\n"
+ "udot z21.s, z26.b, z1.b[2]\n"
+ "udot z18.s, z25.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z22.s, z25.b, z1.b[2]\n"
+ "udot z19.s, z24.b, z0.b[2]\n"
+ "udot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[3]\n"
+ "udot z20.s, z24.b, z1.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[3]\n"
+ "udot z21.s, z26.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z0.b[3]\n"
+ "udot z22.s, z25.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z0.b[3]\n"
+ "udot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -473,120 +473,120 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
- "neg z2.s, p2/M, z2.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
"addvl x10, x10, #4\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z27.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "add z22.s, z22.s, z26.s\n"
+ "add z23.s, z23.s, z25.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "and z30.d, z17.d, z0.d\n"
+ "and z29.d, z18.d, z0.d\n"
+ "and z28.d, z19.d, z0.d\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z0.d\n"
+ "and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z23.s, z23.s, z4.s\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
"st1b { z20.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
@@ -624,13 +624,13 @@ void sve_hybrid_u8qa_dot_4x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -639,8 +639,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
"cmp x25, #0x10\n"
"ble 37f\n"
@@ -650,73 +650,73 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
+ "udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "udot z18.s, z29.b, z0.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "udot z26.s, z29.b, z2.b[0]\n"
+ "udot z19.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "udot z16.s, z3.b, z0.b[1]\n"
+ "udot z20.s, z3.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
"add x23, x23, #0x10\n"
- "udot z24.s, z8.b, z2.b[1]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "udot z24.s, z3.b, z2.b[1]\n"
+ "udot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
"add x22, x22, #0x10\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z21.s, z31.b, z1.b[1]\n"
+ "udot z25.s, z31.b, z2.b[1]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z18.s, z30.b, z0.b[1]\n"
+ "udot z22.s, z30.b, z1.b[1]\n"
+ "udot z26.s, z30.b, z2.b[1]\n"
+ "udot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "udot z23.s, z29.b, z1.b[1]\n"
+ "udot z27.s, z29.b, z2.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z28.b, z0.b[2]\n"
+ "udot z20.s, z28.b, z1.b[2]\n"
+ "udot z24.s, z28.b, z2.b[2]\n"
+ "udot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[2]\n"
+ "udot z25.s, z5.b, z2.b[2]\n"
+ "udot z18.s, z4.b, z0.b[2]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z26.s, z4.b, z2.b[2]\n"
+ "udot z19.s, z3.b, z0.b[2]\n"
+ "udot z23.s, z3.b, z1.b[2]\n"
+ "udot z27.s, z3.b, z2.b[2]\n"
+ "udot z16.s, z31.b, z0.b[3]\n"
+ "udot z20.s, z31.b, z1.b[3]\n"
+ "udot z24.s, z31.b, z2.b[3]\n"
+ "udot z17.s, z30.b, z0.b[3]\n"
+ "udot z21.s, z30.b, z1.b[3]\n"
+ "udot z25.s, z30.b, z2.b[3]\n"
+ "udot z18.s, z29.b, z0.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z2.b[3]\n"
+ "udot z19.s, z28.b, z0.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z2.b[3]\n"
"tbnz %x[flags], #31, 36f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
@@ -731,79 +731,79 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z1.b }, p0/Z, [x23]\n"
"subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
+ "udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
+ "udot z18.s, z29.b, z0.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "udot z26.s, z29.b, z2.b[0]\n"
+ "udot z19.s, z28.b, z0.b[0]\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z2.b[0]\n"
"ble 38f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "udot z24.s, z8.b, z2.b[1]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z16.s, z31.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z31.b, z1.b[1]\n"
+ "udot z24.s, z31.b, z2.b[1]\n"
+ "udot z17.s, z30.b, z0.b[1]\n"
+ "udot z21.s, z30.b, z1.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
+ "udot z25.s, z30.b, z2.b[1]\n"
+ "udot z18.s, z29.b, z0.b[1]\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "udot z26.s, z29.b, z2.b[1]\n"
+ "udot z19.s, z28.b, z0.b[1]\n"
+ "udot z23.s, z28.b, z1.b[1]\n"
+ "udot z27.s, z28.b, z2.b[1]\n"
"ble 38f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z31.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z31.b, z1.b[2]\n"
+ "udot z24.s, z31.b, z2.b[2]\n"
+ "udot z17.s, z30.b, z0.b[2]\n"
+ "udot z21.s, z30.b, z1.b[2]\n"
"addvl x28, x28, #4\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z25.s, z30.b, z2.b[2]\n"
+ "udot z18.s, z29.b, z0.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z26.s, z29.b, z2.b[2]\n"
+ "udot z19.s, z28.b, z0.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z27.s, z28.b, z2.b[2]\n"
"ble 38f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z31.b, z0.b[3]\n"
+ "udot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z31.b, z2.b[3]\n"
+ "udot z17.s, z30.b, z0.b[3]\n"
+ "udot z21.s, z30.b, z1.b[3]\n"
+ "udot z25.s, z30.b, z2.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z18.s, z29.b, z0.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z2.b[3]\n"
+ "udot z19.s, z28.b, z0.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z2.b[3]\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -821,33 +821,33 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
"uaddv d13, p0, z13.s\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
- "neg z3.s, p2/M, z3.s\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "neg z28.s, p2/M, z28.s\n"
+ "mul z11.s, p2/M, z11.s, z28.s\n"
+ "mul z12.s, p2/M, z12.s, z28.s\n"
+ "mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
"ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
@@ -855,133 +855,133 @@ void sve_hybrid_u8qa_dot_4x4VL (
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z31.s\n"
+ "add z18.s, z18.s, z30.s\n"
+ "add z19.s, z19.s, z29.s\n"
"add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z22.s, z22.s, z30.s\n"
+ "add z23.s, z23.s, z29.s\n"
"add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
+ "add z25.s, z25.s, z31.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z26.s, z26.s, z30.s\n"
+ "add z27.s, z27.s, z29.s\n"
+ ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n"
+ ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n"
+ ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n"
+ ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n"
+ ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n"
+ ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n"
+ ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n"
+ ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n"
+ ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n"
+ ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n"
+ ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n"
+ ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z16.d, z0.d\n"
+ "and z31.d, z17.d, z0.d\n"
+ "and z30.d, z18.d, z0.d\n"
+ "and z29.d, z19.d, z0.d\n"
+ "and z28.d, z20.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z1.s\n"
+ "sqadd z17.s, z17.s, z31.s\n"
+ "sqadd z18.s, z18.s, z30.s\n"
+ "sqadd z19.s, z19.s, z29.s\n"
+ "sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z0.d\n"
+ "and z29.d, z26.d, z0.d\n"
+ "and z28.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z3.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z27.s, z27.s, z4.s\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z29.s\n"
+ "smin z17.s, p2/M, z17.s, z29.s\n"
+ "smin z18.s, p2/M, z18.s, z29.s\n"
+ "smin z19.s, p2/M, z19.s, z29.s\n"
+ "smin z20.s, p2/M, z20.s, z29.s\n"
+ "smin z21.s, p2/M, z21.s, z29.s\n"
+ "smin z22.s, p2/M, z22.s, z29.s\n"
+ "smin z23.s, p2/M, z23.s, z29.s\n"
+ "smin z24.s, p2/M, z24.s, z29.s\n"
+ "smin z25.s, p2/M, z25.s, z29.s\n"
+ "smin z26.s, p2/M, z26.s, z29.s\n"
+ "smin z27.s, p2/M, z27.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x23]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
"st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
@@ -1027,14 +1027,14 @@ void sve_hybrid_u8qa_dot_4x4VL (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1044,9 +1044,9 @@ void sve_hybrid_u8qa_dot_4x4VL (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
"cmp x25, #0x10\n"
"ble 51f\n"
@@ -1059,88 +1059,88 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z3.b }, p0/Z, [x21]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "udot z28.s, z4.b, z3.b[0]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[0]\n"
+ "udot z20.s, z5.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z5.b, z2.b[0]\n"
+ "udot z28.s, z5.b, z3.b[0]\n"
+ "udot z17.s, z4.b, z0.b[0]\n"
+ "udot z21.s, z4.b, z1.b[0]\n"
"ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "udot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z25.s, z4.b, z2.b[0]\n"
+ "udot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
"add x21, x21, #0x10\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "udot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "udot z31.s, z9.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z16.s, z8.b, z0.b[1]\n"
"udot z20.s, z8.b, z1.b[1]\n"
"udot z24.s, z8.b, z2.b[1]\n"
"udot z28.s, z8.b, z3.b[1]\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
- "udot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "udot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z29.s, z6.b, z3.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z30.s, z7.b, z3.b[2]\n"
+ "udot z17.s, z7.b, z0.b[1]\n"
+ "udot z21.s, z7.b, z1.b[1]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ "udot z29.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[1]\n"
+ "udot z22.s, z6.b, z1.b[1]\n"
+ "udot z26.s, z6.b, z2.b[1]\n"
+ "udot z30.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "udot z19.s, z5.b, z0.b[1]\n"
+ "udot z23.s, z5.b, z1.b[1]\n"
+ "udot z27.s, z5.b, z2.b[1]\n"
+ "udot z31.s, z5.b, z3.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z4.b, z0.b[2]\n"
+ "udot z20.s, z4.b, z1.b[2]\n"
+ "udot z24.s, z4.b, z2.b[2]\n"
+ "udot z28.s, z4.b, z3.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z17.s, z10.b, z0.b[2]\n"
+ "udot z21.s, z10.b, z1.b[2]\n"
+ "udot z25.s, z10.b, z2.b[2]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z30.s, z9.b, z3.b[2]\n"
"udot z19.s, z8.b, z0.b[2]\n"
"udot z23.s, z8.b, z1.b[2]\n"
"udot z27.s, z8.b, z2.b[2]\n"
"udot z31.s, z8.b, z3.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z28.s, z9.b, z3.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z29.s, z10.b, z3.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z30.s, z4.b, z3.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
- "udot z31.s, z5.b, z3.b[3]\n"
+ "udot z16.s, z7.b, z0.b[3]\n"
+ "udot z20.s, z7.b, z1.b[3]\n"
+ "udot z24.s, z7.b, z2.b[3]\n"
+ "udot z28.s, z7.b, z3.b[3]\n"
+ "udot z17.s, z6.b, z0.b[3]\n"
+ "udot z21.s, z6.b, z1.b[3]\n"
+ "udot z25.s, z6.b, z2.b[3]\n"
+ "udot z29.s, z6.b, z3.b[3]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z22.s, z5.b, z1.b[3]\n"
+ "udot z26.s, z5.b, z2.b[3]\n"
+ "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z19.s, z4.b, z0.b[3]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "udot z27.s, z4.b, z2.b[3]\n"
+ "udot z31.s, z4.b, z3.b[3]\n"
"tbnz %x[flags], #31, 50f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
@@ -1157,95 +1157,95 @@ void sve_hybrid_u8qa_dot_4x4VL (
"subs x25, x25, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x22]\n"
"ld1rqb { z3.b }, p0/Z, [x21]\n"
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "udot z28.s, z4.b, z3.b[0]\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z7.b, z0.b[0]\n"
+ "udot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z28.s, z7.b, z3.b[0]\n"
+ "udot z17.s, z6.b, z0.b[0]\n"
+ "udot z21.s, z6.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "udot z29.s, z5.b, z3.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z30.s, z6.b, z3.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "udot z31.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z6.b, z2.b[0]\n"
+ "udot z29.s, z6.b, z3.b[0]\n"
+ "udot z18.s, z5.b, z0.b[0]\n"
+ "udot z22.s, z5.b, z1.b[0]\n"
+ "udot z26.s, z5.b, z2.b[0]\n"
+ "udot z30.s, z5.b, z3.b[0]\n"
+ "udot z19.s, z4.b, z0.b[0]\n"
+ "udot z23.s, z4.b, z1.b[0]\n"
+ "udot z27.s, z4.b, z2.b[0]\n"
+ "udot z31.s, z4.b, z3.b[0]\n"
"ble 52f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z16.s, z7.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "udot z24.s, z8.b, z2.b[1]\n"
- "udot z28.s, z8.b, z3.b[1]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z20.s, z7.b, z1.b[1]\n"
+ "udot z24.s, z7.b, z2.b[1]\n"
+ "udot z28.s, z7.b, z3.b[1]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z29.s, z9.b, z3.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z30.s, z10.b, z3.b[1]\n"
+ "udot z21.s, z6.b, z1.b[1]\n"
+ "udot z25.s, z6.b, z2.b[1]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z3.b[1]\n"
"udot z19.s, z4.b, z0.b[1]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"udot z27.s, z4.b, z2.b[1]\n"
"udot z31.s, z4.b, z3.b[1]\n"
"ble 52f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x4\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "udot z28.s, z5.b, z3.b[2]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "udot z28.s, z7.b, z3.b[2]\n"
"udot z17.s, z6.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z25.s, z6.b, z2.b[2]\n"
"udot z29.s, z6.b, z3.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z30.s, z7.b, z3.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
- "udot z31.s, z8.b, z3.b[2]\n"
+ "udot z18.s, z5.b, z0.b[2]\n"
+ "udot z22.s, z5.b, z1.b[2]\n"
+ "udot z26.s, z5.b, z2.b[2]\n"
+ "udot z30.s, z5.b, z3.b[2]\n"
+ "udot z19.s, z4.b, z0.b[2]\n"
+ "udot z23.s, z4.b, z1.b[2]\n"
+ "udot z27.s, z4.b, z2.b[2]\n"
+ "udot z31.s, z4.b, z3.b[2]\n"
"ble 52f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z28.s, z9.b, z3.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z7.b, z0.b[3]\n"
+ "udot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[3]\n"
+ "udot z28.s, z7.b, z3.b[3]\n"
+ "udot z17.s, z6.b, z0.b[3]\n"
+ "udot z21.s, z6.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z29.s, z10.b, z3.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z30.s, z4.b, z3.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
- "udot z31.s, z5.b, z3.b[3]\n"
+ "udot z25.s, z6.b, z2.b[3]\n"
+ "udot z29.s, z6.b, z3.b[3]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z22.s, z5.b, z1.b[3]\n"
+ "udot z26.s, z5.b, z2.b[3]\n"
+ "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z19.s, z4.b, z0.b[3]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "udot z27.s, z4.b, z2.b[3]\n"
+ "udot z31.s, z4.b, z3.b[3]\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -1265,7 +1265,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov x20, #0x4\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
@@ -1273,28 +1273,28 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
"uaddv d14, p0, z14.s\n"
- "neg z4.s, p2/M, z4.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
"add z21.s, z21.s, z12.s\n"
"add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
"add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
@@ -1305,174 +1305,174 @@ void sve_hybrid_u8qa_dot_4x4VL (
"add z29.s, z29.s, z14.s\n"
"add z30.s, z30.s, z14.s\n"
"add z31.s, z31.s, z14.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z29.s, z29.s, z1.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z0.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z0.s\n"
"ld1rw { z0.s }, p2/Z, [x20]\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z16.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "and z9.d, z21.d, z0.d\n"
- "and z10.d, z22.d, z0.d\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z2.d, z16.d, z0.d\n"
+ "and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "and z7.d, z18.d, z0.d\n"
+ "and z6.d, z19.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z4.d, z21.d, z0.d\n"
+ "and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
- "and z10.d, z29.d, z0.d\n"
- "and z4.d, z30.d, z0.d\n"
- "and z5.d, z31.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z4.s\n"
+ "sqadd z22.s, z22.s, z3.s\n"
+ "sqadd z23.s, z23.s, z2.s\n"
+ "sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
+ "and z6.d, z26.d, z0.d\n"
+ "and z5.d, z27.d, z0.d\n"
+ "and z4.d, z28.d, z0.d\n"
+ "and z3.d, z29.d, z0.d\n"
+ "and z2.d, z30.d, z0.d\n"
+ "and z1.d, z31.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
- "sqadd z28.s, z28.s, z9.s\n"
- "sqadd z29.s, z29.s, z10.s\n"
- "sqadd z30.s, z30.s, z4.s\n"
- "sqadd z31.s, z31.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "sqadd z29.s, z29.s, z3.s\n"
+ "sqadd z30.s, z30.s, z2.s\n"
+ "sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
"add x20, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
- "add z21.s, z21.s, z4.s\n"
- "add z22.s, z22.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "add z27.s, z27.s, z4.s\n"
- "add z28.s, z28.s, z4.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z29.s, z29.s, z4.s\n"
- "add z30.s, z30.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"add x20, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x20]\n"
- "add z31.s, z31.s, z4.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z2.s\n"
"add x20, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x20]\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"st1b { z16.b }, p1, [x27]\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"st1b { z20.b }, p1, [x23]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
"st1b { z24.b }, p1, [x22]\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
"st1b { z28.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
@@ -1491,7 +1491,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1499,4 +1498,4 @@ void sve_hybrid_u8qa_dot_4x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
index da27554a0f..5de68cc738 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
index f9d38c2925..69894bec41 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -108,11 +108,11 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
"cbnz x26, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -125,41 +125,41 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
+ ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
+ ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
"add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -171,43 +171,43 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
+ ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
+ ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
+ ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
"addvl x28, x28, #8\n"
"ble 10f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
"addvl x28, x28, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
@@ -224,74 +224,74 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"uzp1 z19.d, z19.d, z23.d\n"
"mov z23.d, z16.d\n"
"tbnz %x[flags], #31, 12f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "neg z1.s, p2/M, z1.s\n"
+ "neg z16.s, p2/M, z16.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
"12:" // Height 1: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x10]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "add z23.s, z23.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ "add z23.s, z23.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
"addvl x10, x10, #4\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z23.d, z0.d\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z22.d, z23.d, z0.d\n"
+ "and z21.d, z17.d, z0.d\n"
+ "and z20.d, z18.d, z0.d\n"
+ "and z16.d, z19.d, z0.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z22.s\n"
+ "sqadd z17.s, z17.s, z21.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
"13:" // Height 1: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z16.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "smax z17.s, p2/M, z17.s, z16.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
"uzp1 z23.h, z23.h, z17.h\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "uzp1 z23.b, z23.b, z17.b\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
+ "uzp1 z16.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
"st1b { z23.b }, p1, [x27]\n"
"addvl x27, x27, #1\n"
"14:" // Height 1: Writeback done
@@ -324,12 +324,12 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
"cbnz x26, 20f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -337,49 +337,49 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"b 20f\n"
"19:" // Height 2: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
"cmp x25, #0x10\n"
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
+ ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
+ ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"tbnz %x[flags], #31, 22f\n"
@@ -392,44 +392,44 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"23:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x25\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
+ ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
+ ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
+ ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
"addvl x28, x28, #8\n"
"ble 24f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
"addvl x28, x28, #8\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
@@ -440,133 +440,133 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"add x26, x26, #0x1\n"
"cmp x26, x20\n"
"bne 18b\n"
- "uzp1 z7.d, z16.d, z20.d\n"
+ "uzp1 z24.d, z16.d, z20.d\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
+ "add x23, x27, x20\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "mov z23.d, z7.d\n"
+ "mov z23.d, z24.d\n"
"tbnz %x[flags], #31, 26f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
- "neg z2.s, p2/M, z2.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z23.s, z23.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "add z23.s, z23.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z20.s, z20.s, z27.s\n"
"addvl x10, x10, #4\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z5.d, z20.d, z0.d\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z0.d\n"
+ "and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
"27:" // Height 2: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z23.s, z23.s, z24.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z19.s, z19.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
"uzp1 z23.h, z23.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z23.b, z23.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z23.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x23]\n"
"addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -607,13 +607,13 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
"cbnz x26, 34f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -622,8 +622,8 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"b 34f\n"
"33:" // Height 3: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
"cmp x25, #0x10\n"
"ble 37f\n"
@@ -634,60 +634,60 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ld1rqb { z3.b }, p0/Z, [x22]\n"
"trn1 z0.d, z1.d, z2.d\n"
"trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n"
+ ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"add x23, x23, #0x10\n"
".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
"add x22, x22, #0x10\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
+ ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
+ ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
+ ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
- ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
"tbnz %x[flags], #31, 36f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
@@ -708,56 +708,56 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"trn1 z2.d, z3.d, z4.d\n"
"trn2 z3.d, z3.d, z4.d\n"
".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
"subs x25, x25, #0x8\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n"
+ ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n"
".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
+ ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
+ ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
+ ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n"
+ ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n"
+ ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n"
+ ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
"ble 38f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
- ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -770,12 +770,12 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"cmp x26, x20\n"
"bne 32b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
"uzp2 z17.d, z17.d, z21.d\n"
- "add x21, x22, x20\n"
+ "add x22, x23, x20\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
"uzp1 z22.d, z19.d, z23.d\n"
@@ -784,170 +784,170 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
"uzp1 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
+ "mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 40f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "neg z3.s, p2/M, z3.s\n"
+ "neg z23.s, p2/M, z23.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z11.s, p2/M, z11.s, z23.s\n"
"mov z13.s, z13.s[0]\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z23.s\n"
+ "mul z13.s, p2/M, z13.s, z23.s\n"
"40:" // Height 3: skip row sum fixup
"add z31.s, z31.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
"ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
"addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z31.s, z31.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z28.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
"add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n"
+ ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
+ ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n"
+ ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n"
+ ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n"
+ ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z20.d, z0.d\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "and z5.d, z24.d, z0.d\n"
- "and z6.d, z25.d, z0.d\n"
- "and z7.d, z26.d, z0.d\n"
- "and z8.d, z27.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z31.d, z0.d\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z23.d, z16.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z1.s\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
+ "and z1.d, z19.d, z0.d\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z0.d\n"
+ "and z28.d, z26.d, z0.d\n"
+ "and z23.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z3.s\n"
+ "sqadd z18.s, z18.s, z2.s\n"
+ "sqadd z19.s, z19.s, z1.s\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
"41:" // Height 3: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z31.s, z31.s, z23.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z31.s, p2/M, z31.s, z23.s\n"
+ "smax z20.s, p2/M, z20.s, z23.s\n"
+ "smax z21.s, p2/M, z21.s, z23.s\n"
"uzp1 z31.h, z31.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z23.s\n"
+ "smax z16.s, p2/M, z16.s, z23.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z31.b, z31.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z23.s\n"
+ "smax z18.s, p2/M, z18.s, z23.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z31.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z23.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z23.s\n"
+ "smax z26.s, p2/M, z26.s, z23.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
"addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -992,14 +992,14 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
- "ldr x23, [x21, #0x8]\n"
- "ldr x22, [x21, #0x10]\n"
- "ldr x21, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
"cbnz x26, 48f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x24, x24, x20\n"
@@ -1009,9 +1009,9 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"b 48f\n"
"47:" // Height 4: setup direct input
"mov x24, %x[input_ptr]\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
"cmp x25, #0x10\n"
"ble 51f\n"
@@ -1021,63 +1021,63 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59811 // ummla z17.s, z0.b, z5.b\n"
+ ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c49815 // ummla z21.s, z0.b, z4.b\n"
+ ".inst 0x45c4985d // ummla z29.s, z2.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
"add x24, x24, #0x10\n"
".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
"add x22, x22, #0x10\n"
- ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
+ ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
"add x21, x21, #0x10\n"
- ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
- ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
"tbnz %x[flags], #31, 50f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
@@ -1093,62 +1093,62 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"ld1rqb { z2.b }, p0/Z, [x23]\n"
"trn1 z0.d, z1.d, z2.d\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
"trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
"subs x25, x25, #0x8\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
"ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
+ ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
+ ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n"
+ ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n"
+ ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n"
+ ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
"ble 52f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
"ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
- ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
- ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
"addvl x28, x28, #8\n"
- ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
- ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -1161,12 +1161,12 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"cmp x26, x20\n"
"bne 46b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "uzp1 z7.d, z16.d, z20.d\n"
- "add x22, x27, x20\n"
- "add x21, x22, x20\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"uzp2 z16.d, z16.d, z20.d\n"
"uzp1 z20.d, z17.d, z21.d\n"
- "add x20, x21, x20\n"
+ "add x21, x22, x20\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
"uzp2 z18.d, z18.d, z22.d\n"
@@ -1180,38 +1180,38 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"uzp2 z26.d, z26.d, z30.d\n"
"uzp1 z30.d, z27.d, z31.d\n"
"uzp2 z27.d, z27.d, z31.d\n"
- "mov z31.d, z7.d\n"
+ "mov z31.d, z0.d\n"
"tbnz %x[flags], #31, 54f\n"
- "add x23, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
- "neg z4.s, p2/M, z4.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z12.s, z11.s[3]\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
"mov z14.s, z13.s[3]\n"
"mov z13.s, z13.s[0]\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z31.s, z31.s, z11.s\n"
"add z20.s, z20.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z21.s, z21.s, z11.s\n"
"add z22.s, z22.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z16.s, z16.s, z12.s\n"
"add z17.s, z17.s, z12.s\n"
- "add x23, %x[qp], %[per_layer_mul]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z18.s, z18.s, z12.s\n"
"add z19.s, z19.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z23.s, z23.s, z13.s\n"
"add z28.s, z28.s, z13.s\n"
"addvl x10, x10, #4\n"
@@ -1221,175 +1221,175 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"add z25.s, z25.s, z14.s\n"
"add z26.s, z26.s, z14.s\n"
"add z27.s, z27.s, z14.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z20.s, z20.s, z1.s\n"
- "add z21.s, z21.s, z2.s\n"
- "add z22.s, z22.s, z3.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z23.s, z23.s, z0.s\n"
- "add z28.s, z28.s, z1.s\n"
- "add z29.s, z29.s, z2.s\n"
- "add z30.s, z30.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z3.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z3.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z31.d, z0.d\n"
- "and z5.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z31.s, z31.s, z4.s\n"
- "sqadd z20.s, z20.s, z5.s\n"
- "and z6.d, z21.d, z0.d\n"
- "and z7.d, z22.d, z0.d\n"
- "and z8.d, z16.d, z0.d\n"
- "and z9.d, z17.d, z0.d\n"
- "and z10.d, z18.d, z0.d\n"
- "and z4.d, z19.d, z0.d\n"
- "and z5.d, z23.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
+ "and z2.d, z31.d, z0.d\n"
+ "and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
+ "and z7.d, z21.d, z0.d\n"
+ "and z6.d, z22.d, z0.d\n"
+ "and z5.d, z16.d, z0.d\n"
+ "and z4.d, z17.d, z0.d\n"
+ "and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z21.s, z21.s, z6.s\n"
- "sqadd z22.s, z22.s, z7.s\n"
- "sqadd z16.s, z16.s, z8.s\n"
- "sqadd z17.s, z17.s, z9.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- "sqadd z19.s, z19.s, z4.s\n"
- "sqadd z23.s, z23.s, z5.s\n"
- "and z6.d, z28.d, z0.d\n"
- "and z7.d, z29.d, z0.d\n"
- "and z8.d, z30.d, z0.d\n"
- "and z9.d, z24.d, z0.d\n"
- "and z10.d, z25.d, z0.d\n"
- "and z4.d, z26.d, z0.d\n"
- "and z5.d, z27.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "asr z7.s, z7.s, #0x1f\n"
- "asr z8.s, z8.s, #0x1f\n"
- "asr z9.s, z9.s, #0x1f\n"
- "asr z10.s, z10.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z16.s, z16.s, z5.s\n"
+ "sqadd z17.s, z17.s, z4.s\n"
+ "sqadd z18.s, z18.s, z3.s\n"
+ "sqadd z19.s, z19.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z0.d\n"
+ "and z5.d, z30.d, z0.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "and z3.d, z25.d, z0.d\n"
+ "and z2.d, z26.d, z0.d\n"
+ "and z1.d, z27.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z28.s, z28.s, z6.s\n"
- "sqadd z29.s, z29.s, z7.s\n"
- "sqadd z30.s, z30.s, z8.s\n"
- "sqadd z24.s, z24.s, z9.s\n"
- "sqadd z25.s, z25.s, z10.s\n"
- "sqadd z26.s, z26.s, z4.s\n"
- "sqadd z27.s, z27.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z3.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "sqadd z27.s, z27.s, z1.s\n"
"55:" // Height 4: no shift correction
- "add x23, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x23]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add z31.s, z31.s, z2.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "add z28.s, z28.s, z4.s\n"
- "add z29.s, z29.s, z4.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "add z30.s, z30.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "add z25.s, z25.s, z4.s\n"
- "add z26.s, z26.s, z4.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "add x23, %x[qp], %[maxval]\n"
- "ld1rw { z6.s }, p2/Z, [x23]\n"
- "add z27.s, z27.s, z4.s\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1rw { z5.s }, p2/Z, [x23]\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
"uzp1 z31.h, z31.h, z20.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
"uzp1 z20.h, z21.h, z22.h\n"
"uzp1 z31.b, z31.b, z20.b\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
"st1b { z31.b }, p1, [x27]\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
"uzp1 z23.h, z23.h, z28.h\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "uzp1 z28.h, z29.h, z30.h\n"
- "uzp1 z23.b, z23.b, z28.b\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "st1b { z23.b }, p1, [x21]\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x20]\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x21]\n"
"addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
@@ -1407,7 +1407,6 @@ void sve_hybrid_u8qa_mmla_4x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1415,4 +1414,4 @@ void sve_hybrid_u8qa_mmla_4x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
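
Note on the hunk above: the trailing guard is retargeted from the compiler-defined feature macro __ARM_FEATURE_SVE to the build option ARM_COMPUTE_ENABLE_SVE. A minimal compilable sketch of the resulting pattern follows; the kernel name is a hypothetical stand-in for generated kernels like the one above, and the usual motivation (an assumption here, not stated in the patch) is that a build-system macro lets the file be compiled into a multi-ISA binary and selected at runtime, independent of what the compiler defines for the rest of the build.

#ifdef ARM_COMPUTE_ENABLE_SVE   // assumed to be set by the build system, not by the compiler
namespace arm_gemm {
// Hypothetical stand-in for a generated SVE kernel such as sve_hybrid_u8qa_mmla_4x4VL.
void sve_example_kernel() {}
} // namespace arm_gemm
#endif // ARM_COMPUTE_ENABLE_SVE
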
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index 901cc6d63e..e9197e8ec5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -39,6 +39,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+void sve_hybrid_u8u32_dot_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_u8u32_dot_6x4VL
{
@@ -74,7 +75,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, uint32_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -83,10 +83,11 @@ public:
return { 20.98 };
case CPUModel::V1:
return { 62.19 };
+ case CPUModel::A64FX:
+ return { 91.23 };
}
}
-
if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -95,6 +96,8 @@ public:
return { 22.75, 3.90, 0.47 };
case CPUModel::V1:
return { 48.09, 16.24, 0.83 };
+ case CPUModel::A64FX:
+ return { 101.62, 3.15, 0.42 };
}
}
@@ -103,13 +106,19 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
- cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_u8u32_dot_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
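
For reference, the constructor change above is the whole dispatch mechanism: kernel defaults to the generic sve_hybrid_u8u32_dot_6x4VL and is replaced with sve_hybrid_u8u32_dot_6x4VL_a64fx when the detected CPU model is A64FX. Below is a self-contained sketch of that pattern; the enum, the function bodies, and main are simplified stand-ins, not the library's actual API.

#include <cstdio>

enum class CPUModel { GENERIC, V1, A64FX };   // stand-in for the library's CPUModel

using kern_type = void (*)();

// Stand-in kernels; the real ones take the full ARGLIST of GEMM arguments.
void sve_hybrid_u8u32_dot_6x4VL()       { std::puts("generic kernel"); }
void sve_hybrid_u8u32_dot_6x4VL_a64fx() { std::puts("A64FX kernel"); }

struct cls_sketch {
    kern_type kernel = sve_hybrid_u8u32_dot_6x4VL;   // default to the generic kernel
    explicit cls_sketch(CPUModel model)
    {
        switch (model) {
            default:
                break;
            case CPUModel::A64FX:
                kernel = sve_hybrid_u8u32_dot_6x4VL_a64fx;   // A64FX gets its own variant
                break;
        }
    }
};

int main()
{
    cls_sketch c(CPUModel::A64FX);
    c.kernel();   // prints "A64FX kernel"
}

The new A64FX rows in get_performance_parameters serve the same purpose one level up, giving the kernel-selection heuristics model-specific throughput estimates.
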
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
index a7dbef329e..4d0f44982a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
@@ -115,11 +115,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -135,12 +135,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"8:" // Height 1: Multiply loop: Main loop
"udot z8.s, z6.b, z0.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x26, x26, #0x4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z11.s, z7.b, z0.b\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z11.s, z16.b, z0.b\n"
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
@@ -150,12 +150,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"udot z8.s, z6.b, z0.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z11.s, z7.b, z0.b\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z11.s, z16.b, z0.b\n"
"addvl x10, x10, #4\n"
"bne 5b\n"
"st1w { z8.s }, p3, [x9]\n"
@@ -183,15 +183,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 13f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 14f\n"
"13:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -207,12 +207,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"15:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 17f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -220,7 +220,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 17f\n"
"16:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"17:" // Height 2: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -231,18 +231,18 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"18:" // Height 2: Multiply loop: Main loop
"udot z8.s, z6.b, z0.b\n"
"udot z12.s, z6.b, z1.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x26, x26, #0x4\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"subs x27, x27, #0x4\n"
"add x25, x25, #0x4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z14.s, z17.b, z1.b\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "udot z15.s, z16.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
@@ -252,29 +252,29 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"udot z8.s, z6.b, z0.b\n"
"udot z12.s, z6.b, z1.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z14.s, z17.b, z1.b\n"
"addvl x10, x10, #4\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "udot z15.s, z16.b, z1.b\n"
"bne 15b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x20]\n"
+ "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
"20:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -295,20 +295,20 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 23f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 24f\n"
"23:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -328,13 +328,13 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"25:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 26f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 27f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -343,8 +343,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 27f\n"
"26:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"27:" // Height 3: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -360,21 +360,21 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"subs x27, x27, #0x4\n"
"udot z16.s, z6.b, z2.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
"add x24, x24, #0x4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z11.s, z7.b, z0.b\n"
+ "udot z10.s, z21.b, z0.b\n"
+ "udot z14.s, z21.b, z1.b\n"
+ "udot z18.s, z21.b, z2.b\n"
+ "udot z11.s, z20.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "udot z15.s, z7.b, z1.b\n"
- "udot z19.s, z7.b, z2.b\n"
+ "udot z15.s, z20.b, z1.b\n"
+ "udot z19.s, z20.b, z2.b\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -386,35 +386,35 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"udot z16.s, z6.b, z2.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
- "udot z19.s, z7.b, z2.b\n"
+ "udot z10.s, z21.b, z0.b\n"
+ "udot z14.s, z21.b, z1.b\n"
+ "udot z18.s, z21.b, z2.b\n"
+ "udot z11.s, z20.b, z0.b\n"
+ "udot z15.s, z20.b, z1.b\n"
+ "udot z19.s, z20.b, z2.b\n"
"bne 25b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x21]\n"
+ "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20]\n"
+ "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
"30:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -435,25 +435,25 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 33f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x22]\n"
- "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 34f\n"
"33:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -477,14 +477,14 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"35:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 36f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 37f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -494,9 +494,9 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 37f\n"
"36:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"37:" // Height 4: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -513,7 +513,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"subs x27, x27, #0x4\n"
"udot z16.s, z6.b, z2.b\n"
"udot z20.s, z6.b, z3.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x4\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
@@ -521,19 +521,19 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x23, x23, #0x4\n"
"udot z17.s, z7.b, z2.b\n"
"udot z21.s, z7.b, z3.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z22.s, z6.b, z3.b\n"
+ "udot z10.s, z25.b, z0.b\n"
+ "udot z14.s, z25.b, z1.b\n"
+ "udot z18.s, z25.b, z2.b\n"
+ "udot z22.s, z25.b, z3.b\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
+ "udot z11.s, z24.b, z0.b\n"
+ "udot z15.s, z24.b, z1.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
- "udot z19.s, z7.b, z2.b\n"
- "udot z23.s, z7.b, z3.b\n"
+ "udot z19.s, z24.b, z2.b\n"
+ "udot z23.s, z24.b, z3.b\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -545,44 +545,44 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x28, x28, #0x1\n"
"udot z16.s, z6.b, z2.b\n"
"udot z20.s, z6.b, z3.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
"cmp x28, x20\n"
"udot z9.s, z7.b, z0.b\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
"udot z21.s, z7.b, z3.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z22.s, z6.b, z3.b\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
- "udot z19.s, z7.b, z2.b\n"
- "udot z23.s, z7.b, z3.b\n"
+ "udot z10.s, z25.b, z0.b\n"
+ "udot z14.s, z25.b, z1.b\n"
+ "udot z18.s, z25.b, z2.b\n"
+ "udot z22.s, z25.b, z3.b\n"
+ "udot z11.s, z24.b, z0.b\n"
+ "udot z15.s, z24.b, z1.b\n"
+ "udot z19.s, z24.b, z2.b\n"
+ "udot z23.s, z24.b, z3.b\n"
"bne 35b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"st1w { z8.s }, p3, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x22]\n"
- "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x22]\n"
+ "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x21]\n"
+ "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
"40:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -603,30 +603,30 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 43f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p3/Z, [x24]\n"
- "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p3/Z, [x23]\n"
- "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p3/Z, [x22]\n"
- "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p3/Z, [x21]\n"
- "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -654,15 +654,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"45:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -673,10 +673,10 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 47f\n"
"46:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"47:" // Height 5: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -698,29 +698,29 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"add x24, x24, #0x4\n"
"udot z24.s, z6.b, z4.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"add x23, x23, #0x4\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
"add x22, x22, #0x4\n"
"udot z21.s, z7.b, z3.b\n"
"udot z25.s, z7.b, z4.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z22.s, z6.b, z3.b\n"
- "udot z26.s, z6.b, z4.b\n"
- "udot z11.s, z7.b, z0.b\n"
+ "udot z10.s, z29.b, z0.b\n"
+ "udot z14.s, z29.b, z1.b\n"
+ "udot z18.s, z29.b, z2.b\n"
+ "udot z22.s, z29.b, z3.b\n"
+ "udot z26.s, z29.b, z4.b\n"
+ "udot z11.s, z28.b, z0.b\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
"ld1b { z6.b }, p4/Z, [x10]\n"
- "udot z15.s, z7.b, z1.b\n"
- "udot z19.s, z7.b, z2.b\n"
+ "udot z15.s, z28.b, z1.b\n"
+ "udot z19.s, z28.b, z2.b\n"
"ld1rw { z1.s }, p4/Z, [x25]\n"
"ld1rw { z2.s }, p4/Z, [x24]\n"
- "udot z23.s, z7.b, z3.b\n"
- "udot z27.s, z7.b, z4.b\n"
+ "udot z23.s, z28.b, z3.b\n"
+ "udot z27.s, z28.b, z4.b\n"
"ld1rw { z3.s }, p4/Z, [x23]\n"
"ld1rw { z4.s }, p4/Z, [x22]\n"
"ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -735,50 +735,50 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"cmp x28, x20\n"
"udot z24.s, z6.b, z4.b\n"
"udot z9.s, z7.b, z0.b\n"
- "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
"udot z13.s, z7.b, z1.b\n"
"udot z17.s, z7.b, z2.b\n"
"udot z21.s, z7.b, z3.b\n"
"udot z25.s, z7.b, z4.b\n"
- "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b\n"
- "udot z14.s, z6.b, z1.b\n"
- "udot z18.s, z6.b, z2.b\n"
- "udot z22.s, z6.b, z3.b\n"
- "udot z26.s, z6.b, z4.b\n"
- "udot z11.s, z7.b, z0.b\n"
- "udot z15.s, z7.b, z1.b\n"
- "udot z19.s, z7.b, z2.b\n"
- "udot z23.s, z7.b, z3.b\n"
- "udot z27.s, z7.b, z4.b\n"
+ "udot z10.s, z29.b, z0.b\n"
+ "udot z14.s, z29.b, z1.b\n"
+ "udot z18.s, z29.b, z2.b\n"
+ "udot z22.s, z29.b, z3.b\n"
+ "udot z26.s, z29.b, z4.b\n"
+ "udot z11.s, z28.b, z0.b\n"
+ "udot z15.s, z28.b, z1.b\n"
+ "udot z19.s, z28.b, z2.b\n"
+ "udot z23.s, z28.b, z3.b\n"
+ "udot z27.s, z28.b, z4.b\n"
"bne 45b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "st1w { z8.s }, p3, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p3, [x24]\n"
- "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p3, [x23]\n"
- "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p3, [x22]\n"
- "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p3, [x21]\n"
- "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
"50:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -862,16 +862,16 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"55:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 56f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 57f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -883,11 +883,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"b 57f\n"
"56:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"57:" // Height 6: input setup done
"subs x27, x27, #0x4\n"
"ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1022,7 +1022,6 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"62:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1030,4 +1029,4 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 30a108af7e..7871c0b003 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -115,11 +115,11 @@ void sve_hybrid_u8u32_dot_6x4VL (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -132,87 +132,87 @@ void sve_hybrid_u8u32_dot_6x4VL (
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
"add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[0]\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[1]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z10.s, z17.b, z0.b[1]\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
"addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
"addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -244,15 +244,15 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 15f\n"
"14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -268,12 +268,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -281,146 +281,146 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 18f\n"
"17:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[0]\n"
+ "udot z12.s, z17.b, z0.b[0]\n"
+ "udot z9.s, z16.b, z1.b[0]\n"
+ "udot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[0]\n"
+ "udot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
"cmp x27, #0x10\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z11.s, z16.b, z1.b[0]\n"
+ "udot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
"add x26, x26, #0x10\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[1]\n"
+ "udot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "udot z9.s, z16.b, z1.b[1]\n"
+ "udot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z10.s, z17.b, z1.b[1]\n"
+ "udot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z16.b, z1.b[1]\n"
+ "udot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[2]\n"
+ "udot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z16.b, z1.b[2]\n"
+ "udot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[2]\n"
+ "udot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z16.b, z1.b[2]\n"
+ "udot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[3]\n"
+ "udot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z16.b, z1.b[3]\n"
+ "udot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[3]\n"
+ "udot z14.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z1.b[3]\n"
+ "udot z15.s, z16.b, z0.b[3]\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[0]\n"
+ "udot z12.s, z17.b, z1.b[0]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "udot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[0]\n"
+ "udot z14.s, z17.b, z1.b[0]\n"
"addvl x10, x10, #4\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
+ "udot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[1]\n"
+ "udot z12.s, z17.b, z1.b[1]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "udot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z10.s, z17.b, z0.b[1]\n"
+ "udot z14.s, z17.b, z1.b[1]\n"
"addvl x10, x10, #4\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
+ "udot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z12.s, z17.b, z1.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "udot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z14.s, z17.b, z1.b[2]\n"
"addvl x10, x10, #4\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
+ "udot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z12.s, z17.b, z1.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "udot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z14.s, z17.b, z1.b[3]\n"
"addvl x10, x10, #4\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
+ "udot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x20]\n"
+ "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -441,20 +441,20 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 26f\n"
"25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -474,13 +474,13 @@ void sve_hybrid_u8u32_dot_6x4VL (
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 29f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -489,86 +489,86 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 29f\n"
"28:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "udot z8.s, z21.b, z2.b[0]\n"
+ "udot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z0.b[0]\n"
+ "udot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
+ "udot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"cmp x27, #0x10\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z10.s, z21.b, z2.b[0]\n"
+ "udot z14.s, z21.b, z1.b[0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[0]\n"
+ "udot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "udot z15.s, z20.b, z1.b[0]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[1]\n"
+ "udot z12.s, z21.b, z1.b[1]\n"
+ "udot z16.s, z21.b, z0.b[1]\n"
+ "udot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[1]\n"
+ "udot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z10.s, z21.b, z2.b[1]\n"
+ "udot z14.s, z21.b, z1.b[1]\n"
+ "udot z18.s, z21.b, z0.b[1]\n"
+ "udot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z15.s, z20.b, z1.b[1]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[2]\n"
+ "udot z12.s, z21.b, z1.b[2]\n"
+ "udot z16.s, z21.b, z0.b[2]\n"
+ "udot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[2]\n"
+ "udot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z21.b, z2.b[2]\n"
+ "udot z14.s, z21.b, z1.b[2]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z15.s, z20.b, z1.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[3]\n"
+ "udot z12.s, z21.b, z1.b[3]\n"
+ "udot z16.s, z21.b, z0.b[3]\n"
+ "udot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z21.b, z2.b[3]\n"
+ "udot z14.s, z21.b, z1.b[3]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z11.s, z20.b, z2.b[3]\n"
+ "udot z15.s, z20.b, z1.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -576,100 +576,100 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z1.b }, p0/Z, [x25]\n"
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "udot z8.s, z21.b, z0.b[0]\n"
+ "udot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z2.b[0]\n"
+ "udot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
+ "udot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z10.s, z21.b, z0.b[0]\n"
+ "udot z14.s, z21.b, z1.b[0]\n"
+ "udot z18.s, z21.b, z2.b[0]\n"
+ "udot z11.s, z20.b, z0.b[0]\n"
+ "udot z15.s, z20.b, z1.b[0]\n"
+ "udot z19.s, z20.b, z2.b[0]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[1]\n"
+ "udot z12.s, z21.b, z1.b[1]\n"
+ "udot z16.s, z21.b, z2.b[1]\n"
+ "udot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[1]\n"
+ "udot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z10.s, z21.b, z0.b[1]\n"
+ "udot z14.s, z21.b, z1.b[1]\n"
+ "udot z18.s, z21.b, z2.b[1]\n"
+ "udot z11.s, z20.b, z0.b[1]\n"
+ "udot z15.s, z20.b, z1.b[1]\n"
+ "udot z19.s, z20.b, z2.b[1]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[2]\n"
+ "udot z12.s, z21.b, z1.b[2]\n"
+ "udot z16.s, z21.b, z2.b[2]\n"
+ "udot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[2]\n"
+ "udot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z10.s, z21.b, z0.b[2]\n"
+ "udot z14.s, z21.b, z1.b[2]\n"
+ "udot z18.s, z21.b, z2.b[2]\n"
+ "udot z11.s, z20.b, z0.b[2]\n"
+ "udot z15.s, z20.b, z1.b[2]\n"
+ "udot z19.s, z20.b, z2.b[2]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[3]\n"
+ "udot z12.s, z21.b, z1.b[3]\n"
+ "udot z16.s, z21.b, z2.b[3]\n"
+ "udot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[3]\n"
+ "udot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z10.s, z21.b, z0.b[3]\n"
+ "udot z14.s, z21.b, z1.b[3]\n"
+ "udot z18.s, z21.b, z2.b[3]\n"
+ "udot z11.s, z20.b, z0.b[3]\n"
+ "udot z15.s, z20.b, z1.b[3]\n"
+ "udot z19.s, z20.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x21]\n"
+ "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -690,25 +690,25 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"ld1w { z8.s }, p4/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 37f\n"
"36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -732,14 +732,14 @@ void sve_hybrid_u8u32_dot_6x4VL (
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -749,105 +749,105 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 40f\n"
"39:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[0]\n"
+ "udot z12.s, z25.b, z2.b[0]\n"
+ "udot z16.s, z25.b, z1.b[0]\n"
+ "udot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
+ "udot z9.s, z24.b, z3.b[0]\n"
+ "udot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "udot z17.s, z24.b, z1.b[0]\n"
+ "udot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[0]\n"
+ "udot z14.s, z25.b, z2.b[0]\n"
+ "udot z18.s, z25.b, z1.b[0]\n"
+ "udot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[0]\n"
+ "udot z15.s, z24.b, z2.b[0]\n"
+ "udot z19.s, z24.b, z1.b[0]\n"
+ "udot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[1]\n"
+ "udot z12.s, z25.b, z2.b[1]\n"
+ "udot z16.s, z25.b, z1.b[1]\n"
+ "udot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[1]\n"
+ "udot z13.s, z24.b, z2.b[1]\n"
+ "udot z17.s, z24.b, z1.b[1]\n"
+ "udot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z10.s, z25.b, z3.b[1]\n"
+ "udot z14.s, z25.b, z2.b[1]\n"
+ "udot z18.s, z25.b, z1.b[1]\n"
+ "udot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[1]\n"
+ "udot z15.s, z24.b, z2.b[1]\n"
+ "udot z19.s, z24.b, z1.b[1]\n"
+ "udot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[2]\n"
+ "udot z12.s, z25.b, z2.b[2]\n"
+ "udot z16.s, z25.b, z1.b[2]\n"
+ "udot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[2]\n"
+ "udot z13.s, z24.b, z2.b[2]\n"
+ "udot z17.s, z24.b, z1.b[2]\n"
+ "udot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[2]\n"
+ "udot z14.s, z25.b, z2.b[2]\n"
+ "udot z18.s, z25.b, z1.b[2]\n"
+ "udot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[2]\n"
+ "udot z15.s, z24.b, z2.b[2]\n"
+ "udot z19.s, z24.b, z1.b[2]\n"
+ "udot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[3]\n"
+ "udot z12.s, z25.b, z2.b[3]\n"
+ "udot z16.s, z25.b, z1.b[3]\n"
+ "udot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[3]\n"
+ "udot z13.s, z24.b, z2.b[3]\n"
+ "udot z17.s, z24.b, z1.b[3]\n"
+ "udot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[3]\n"
+ "udot z14.s, z25.b, z2.b[3]\n"
+ "udot z18.s, z25.b, z1.b[3]\n"
+ "udot z22.s, z25.b, z0.b[3]\n"
+ "udot z11.s, z24.b, z3.b[3]\n"
+ "udot z15.s, z24.b, z2.b[3]\n"
+ "udot z19.s, z24.b, z1.b[3]\n"
+ "udot z23.s, z24.b, z0.b[3]\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -856,121 +856,121 @@ void sve_hybrid_u8u32_dot_6x4VL (
"subs x27, x27, #0x4\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[0]\n"
+ "udot z12.s, z25.b, z1.b[0]\n"
+ "udot z16.s, z25.b, z2.b[0]\n"
+ "udot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[0]\n"
+ "udot z13.s, z24.b, z1.b[0]\n"
+ "udot z17.s, z24.b, z2.b[0]\n"
+ "udot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z10.s, z25.b, z0.b[0]\n"
+ "udot z14.s, z25.b, z1.b[0]\n"
+ "udot z18.s, z25.b, z2.b[0]\n"
+ "udot z22.s, z25.b, z3.b[0]\n"
+ "udot z11.s, z24.b, z0.b[0]\n"
+ "udot z15.s, z24.b, z1.b[0]\n"
+ "udot z19.s, z24.b, z2.b[0]\n"
+ "udot z23.s, z24.b, z3.b[0]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[1]\n"
+ "udot z12.s, z25.b, z1.b[1]\n"
+ "udot z16.s, z25.b, z2.b[1]\n"
+ "udot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[1]\n"
+ "udot z13.s, z24.b, z1.b[1]\n"
+ "udot z17.s, z24.b, z2.b[1]\n"
+ "udot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z10.s, z25.b, z0.b[1]\n"
+ "udot z14.s, z25.b, z1.b[1]\n"
+ "udot z18.s, z25.b, z2.b[1]\n"
+ "udot z22.s, z25.b, z3.b[1]\n"
+ "udot z11.s, z24.b, z0.b[1]\n"
+ "udot z15.s, z24.b, z1.b[1]\n"
+ "udot z19.s, z24.b, z2.b[1]\n"
+ "udot z23.s, z24.b, z3.b[1]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[2]\n"
+ "udot z12.s, z25.b, z1.b[2]\n"
+ "udot z16.s, z25.b, z2.b[2]\n"
+ "udot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x4\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[2]\n"
+ "udot z13.s, z24.b, z1.b[2]\n"
+ "udot z17.s, z24.b, z2.b[2]\n"
+ "udot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z10.s, z25.b, z0.b[2]\n"
+ "udot z14.s, z25.b, z1.b[2]\n"
+ "udot z18.s, z25.b, z2.b[2]\n"
+ "udot z22.s, z25.b, z3.b[2]\n"
+ "udot z11.s, z24.b, z0.b[2]\n"
+ "udot z15.s, z24.b, z1.b[2]\n"
+ "udot z19.s, z24.b, z2.b[2]\n"
+ "udot z23.s, z24.b, z3.b[2]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[3]\n"
+ "udot z12.s, z25.b, z1.b[3]\n"
+ "udot z16.s, z25.b, z2.b[3]\n"
+ "udot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[3]\n"
+ "udot z13.s, z24.b, z1.b[3]\n"
+ "udot z17.s, z24.b, z2.b[3]\n"
+ "udot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z10.s, z25.b, z0.b[3]\n"
+ "udot z14.s, z25.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z2.b[3]\n"
+ "udot z22.s, z25.b, z3.b[3]\n"
+ "udot z11.s, z24.b, z0.b[3]\n"
+ "udot z15.s, z24.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z2.b[3]\n"
+ "udot z23.s, z24.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"st1w { z8.s }, p4, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x22]\n"
+ "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x20]\n"
+ "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -991,30 +991,30 @@ void sve_hybrid_u8u32_dot_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
"ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x21]\n"
- "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 48f\n"
"47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -1042,15 +1042,15 @@ void sve_hybrid_u8u32_dot_6x4VL (
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1061,124 +1061,124 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 51f\n"
"50:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
"ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "udot z8.s, z29.b, z4.b[0]\n"
+ "udot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z29.b, z2.b[0]\n"
+ "udot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z24.s, z29.b, z0.b[0]\n"
+ "udot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z13.s, z28.b, z3.b[0]\n"
+ "udot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "udot z21.s, z28.b, z1.b[0]\n"
+ "udot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[0]\n"
+ "udot z14.s, z29.b, z3.b[0]\n"
+ "udot z18.s, z29.b, z2.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "udot z26.s, z29.b, z0.b[0]\n"
+ "udot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[0]\n"
+ "udot z19.s, z28.b, z2.b[0]\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[1]\n"
+ "udot z12.s, z29.b, z3.b[1]\n"
+ "udot z16.s, z29.b, z2.b[1]\n"
+ "udot z20.s, z29.b, z1.b[1]\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "udot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[1]\n"
+ "udot z17.s, z28.b, z2.b[1]\n"
+ "udot z21.s, z28.b, z1.b[1]\n"
+ "udot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z10.s, z29.b, z4.b[1]\n"
+ "udot z14.s, z29.b, z3.b[1]\n"
+ "udot z18.s, z29.b, z2.b[1]\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "udot z26.s, z29.b, z0.b[1]\n"
+ "udot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[1]\n"
+ "udot z19.s, z28.b, z2.b[1]\n"
+ "udot z23.s, z28.b, z1.b[1]\n"
+ "udot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[2]\n"
+ "udot z12.s, z29.b, z3.b[2]\n"
+ "udot z16.s, z29.b, z2.b[2]\n"
+ "udot z20.s, z29.b, z1.b[2]\n"
+ "udot z24.s, z29.b, z0.b[2]\n"
+ "udot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[2]\n"
+ "udot z17.s, z28.b, z2.b[2]\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
+ "udot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[2]\n"
+ "udot z14.s, z29.b, z3.b[2]\n"
+ "udot z18.s, z29.b, z2.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z26.s, z29.b, z0.b[2]\n"
+ "udot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[2]\n"
+ "udot z19.s, z28.b, z2.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[3]\n"
+ "udot z12.s, z29.b, z3.b[3]\n"
+ "udot z16.s, z29.b, z2.b[3]\n"
+ "udot z20.s, z29.b, z1.b[3]\n"
+ "udot z24.s, z29.b, z0.b[3]\n"
+ "udot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[3]\n"
+ "udot z17.s, z28.b, z2.b[3]\n"
+ "udot z21.s, z28.b, z1.b[3]\n"
+ "udot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[3]\n"
+ "udot z14.s, z29.b, z3.b[3]\n"
+ "udot z18.s, z29.b, z2.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z0.b[3]\n"
+ "udot z11.s, z28.b, z4.b[3]\n"
+ "udot z15.s, z28.b, z3.b[3]\n"
+ "udot z19.s, z28.b, z2.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z0.b[3]\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1188,142 +1188,142 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z2.b }, p0/Z, [x24]\n"
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "udot z8.s, z29.b, z0.b[0]\n"
+ "udot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z29.b, z2.b[0]\n"
+ "udot z20.s, z29.b, z3.b[0]\n"
+ "udot z24.s, z29.b, z4.b[0]\n"
+ "udot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[0]\n"
+ "udot z17.s, z28.b, z2.b[0]\n"
+ "udot z21.s, z28.b, z3.b[0]\n"
+ "udot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
+ "udot z10.s, z29.b, z0.b[0]\n"
+ "udot z14.s, z29.b, z1.b[0]\n"
+ "udot z18.s, z29.b, z2.b[0]\n"
+ "udot z22.s, z29.b, z3.b[0]\n"
+ "udot z26.s, z29.b, z4.b[0]\n"
+ "udot z11.s, z28.b, z0.b[0]\n"
+ "udot z15.s, z28.b, z1.b[0]\n"
+ "udot z19.s, z28.b, z2.b[0]\n"
+ "udot z23.s, z28.b, z3.b[0]\n"
+ "udot z27.s, z28.b, z4.b[0]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[1]\n"
+ "udot z12.s, z29.b, z1.b[1]\n"
+ "udot z16.s, z29.b, z2.b[1]\n"
+ "udot z20.s, z29.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z24.s, z29.b, z4.b[1]\n"
+ "udot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[1]\n"
+ "udot z17.s, z28.b, z2.b[1]\n"
+ "udot z21.s, z28.b, z3.b[1]\n"
+ "udot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
+ "udot z10.s, z29.b, z0.b[1]\n"
+ "udot z14.s, z29.b, z1.b[1]\n"
+ "udot z18.s, z29.b, z2.b[1]\n"
+ "udot z22.s, z29.b, z3.b[1]\n"
+ "udot z26.s, z29.b, z4.b[1]\n"
+ "udot z11.s, z28.b, z0.b[1]\n"
+ "udot z15.s, z28.b, z1.b[1]\n"
+ "udot z19.s, z28.b, z2.b[1]\n"
+ "udot z23.s, z28.b, z3.b[1]\n"
+ "udot z27.s, z28.b, z4.b[1]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[2]\n"
+ "udot z12.s, z29.b, z1.b[2]\n"
+ "udot z16.s, z29.b, z2.b[2]\n"
+ "udot z20.s, z29.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z24.s, z29.b, z4.b[2]\n"
+ "udot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[2]\n"
+ "udot z17.s, z28.b, z2.b[2]\n"
+ "udot z21.s, z28.b, z3.b[2]\n"
+ "udot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
+ "udot z10.s, z29.b, z0.b[2]\n"
+ "udot z14.s, z29.b, z1.b[2]\n"
+ "udot z18.s, z29.b, z2.b[2]\n"
+ "udot z22.s, z29.b, z3.b[2]\n"
+ "udot z26.s, z29.b, z4.b[2]\n"
+ "udot z11.s, z28.b, z0.b[2]\n"
+ "udot z15.s, z28.b, z1.b[2]\n"
+ "udot z19.s, z28.b, z2.b[2]\n"
+ "udot z23.s, z28.b, z3.b[2]\n"
+ "udot z27.s, z28.b, z4.b[2]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[3]\n"
+ "udot z12.s, z29.b, z1.b[3]\n"
+ "udot z16.s, z29.b, z2.b[3]\n"
+ "udot z20.s, z29.b, z3.b[3]\n"
+ "udot z24.s, z29.b, z4.b[3]\n"
+ "udot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[3]\n"
+ "udot z17.s, z28.b, z2.b[3]\n"
+ "udot z21.s, z28.b, z3.b[3]\n"
+ "udot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z10.s, z29.b, z0.b[3]\n"
+ "udot z14.s, z29.b, z1.b[3]\n"
+ "udot z18.s, z29.b, z2.b[3]\n"
+ "udot z22.s, z29.b, z3.b[3]\n"
+ "udot z26.s, z29.b, z4.b[3]\n"
+ "udot z11.s, z28.b, z0.b[3]\n"
+ "udot z15.s, z28.b, z1.b[3]\n"
+ "udot z19.s, z28.b, z2.b[3]\n"
+ "udot z23.s, z28.b, z3.b[3]\n"
+ "udot z27.s, z28.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "st1w { z8.s }, p4, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
"st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
"st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1407,16 +1407,16 @@ void sve_hybrid_u8u32_dot_6x4VL (
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1428,143 +1428,143 @@ void sve_hybrid_u8u32_dot_6x4VL (
"b 62f\n"
"61:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
"sub x27, x27, #0x10\n"
- "ld1rqb { z2.b }, p0/Z, [x24]\n"
- "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[0]\n"
+ "udot z12.s, z1.b, z6.b[0]\n"
+ "udot z16.s, z1.b, z5.b[0]\n"
+ "udot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z24.s, z1.b, z3.b[0]\n"
+ "udot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "udot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[0]\n"
+ "udot z13.s, z0.b, z6.b[0]\n"
+ "udot z17.s, z0.b, z5.b[0]\n"
+ "udot z21.s, z0.b, z4.b[0]\n"
+ "udot z25.s, z0.b, z3.b[0]\n"
+ "udot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[0]\n"
+ "udot z14.s, z1.b, z6.b[0]\n"
+ "udot z18.s, z1.b, z5.b[0]\n"
+ "udot z22.s, z1.b, z4.b[0]\n"
+ "udot z26.s, z1.b, z3.b[0]\n"
+ "udot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[0]\n"
+ "udot z15.s, z0.b, z6.b[0]\n"
+ "udot z19.s, z0.b, z5.b[0]\n"
+ "udot z23.s, z0.b, z4.b[0]\n"
+ "udot z27.s, z0.b, z3.b[0]\n"
+ "udot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[1]\n"
+ "udot z12.s, z1.b, z6.b[1]\n"
+ "udot z16.s, z1.b, z5.b[1]\n"
+ "udot z20.s, z1.b, z4.b[1]\n"
+ "udot z24.s, z1.b, z3.b[1]\n"
+ "udot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[1]\n"
+ "udot z13.s, z0.b, z6.b[1]\n"
+ "udot z17.s, z0.b, z5.b[1]\n"
+ "udot z21.s, z0.b, z4.b[1]\n"
+ "udot z25.s, z0.b, z3.b[1]\n"
+ "udot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "udot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "udot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z30.s, z6.b, z5.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
- "udot z31.s, z7.b, z5.b[3]\n"
+ "udot z10.s, z1.b, z7.b[1]\n"
+ "udot z14.s, z1.b, z6.b[1]\n"
+ "udot z18.s, z1.b, z5.b[1]\n"
+ "udot z22.s, z1.b, z4.b[1]\n"
+ "udot z26.s, z1.b, z3.b[1]\n"
+ "udot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[1]\n"
+ "udot z15.s, z0.b, z6.b[1]\n"
+ "udot z19.s, z0.b, z5.b[1]\n"
+ "udot z23.s, z0.b, z4.b[1]\n"
+ "udot z27.s, z0.b, z3.b[1]\n"
+ "udot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[2]\n"
+ "udot z12.s, z1.b, z6.b[2]\n"
+ "udot z16.s, z1.b, z5.b[2]\n"
+ "udot z20.s, z1.b, z4.b[2]\n"
+ "udot z24.s, z1.b, z3.b[2]\n"
+ "udot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[2]\n"
+ "udot z13.s, z0.b, z6.b[2]\n"
+ "udot z17.s, z0.b, z5.b[2]\n"
+ "udot z21.s, z0.b, z4.b[2]\n"
+ "udot z25.s, z0.b, z3.b[2]\n"
+ "udot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[2]\n"
+ "udot z14.s, z1.b, z6.b[2]\n"
+ "udot z18.s, z1.b, z5.b[2]\n"
+ "udot z22.s, z1.b, z4.b[2]\n"
+ "udot z26.s, z1.b, z3.b[2]\n"
+ "udot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[2]\n"
+ "udot z15.s, z0.b, z6.b[2]\n"
+ "udot z19.s, z0.b, z5.b[2]\n"
+ "udot z23.s, z0.b, z4.b[2]\n"
+ "udot z27.s, z0.b, z3.b[2]\n"
+ "udot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[3]\n"
+ "udot z12.s, z1.b, z6.b[3]\n"
+ "udot z16.s, z1.b, z5.b[3]\n"
+ "udot z20.s, z1.b, z4.b[3]\n"
+ "udot z24.s, z1.b, z3.b[3]\n"
+ "udot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[3]\n"
+ "udot z13.s, z0.b, z6.b[3]\n"
+ "udot z17.s, z0.b, z5.b[3]\n"
+ "udot z21.s, z0.b, z4.b[3]\n"
+ "udot z25.s, z0.b, z3.b[3]\n"
+ "udot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[3]\n"
+ "udot z14.s, z1.b, z6.b[3]\n"
+ "udot z18.s, z1.b, z5.b[3]\n"
+ "udot z22.s, z1.b, z4.b[3]\n"
+ "udot z26.s, z1.b, z3.b[3]\n"
+ "udot z30.s, z1.b, z2.b[3]\n"
+ "udot z11.s, z0.b, z7.b[3]\n"
+ "udot z15.s, z0.b, z6.b[3]\n"
+ "udot z19.s, z0.b, z5.b[3]\n"
+ "udot z23.s, z0.b, z4.b[3]\n"
+ "udot z27.s, z0.b, z3.b[3]\n"
+ "udot z31.s, z0.b, z2.b[3]\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
@@ -1575,127 +1575,127 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z3.b }, p0/Z, [x23]\n"
"ld1rqb { z4.b }, p0/Z, [x22]\n"
"ld1rqb { z5.b }, p0/Z, [x21]\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[0]\n"
+ "udot z12.s, z7.b, z1.b[0]\n"
+ "udot z16.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z7.b, z3.b[0]\n"
+ "udot z24.s, z7.b, z4.b[0]\n"
+ "udot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z1.b[0]\n"
+ "udot z17.s, z6.b, z2.b[0]\n"
+ "udot z21.s, z6.b, z3.b[0]\n"
+ "udot z25.s, z6.b, z4.b[0]\n"
+ "udot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z30.s, z6.b, z5.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "udot z31.s, z7.b, z5.b[0]\n"
+ "udot z10.s, z7.b, z0.b[0]\n"
+ "udot z14.s, z7.b, z1.b[0]\n"
+ "udot z18.s, z7.b, z2.b[0]\n"
+ "udot z22.s, z7.b, z3.b[0]\n"
+ "udot z26.s, z7.b, z4.b[0]\n"
+ "udot z30.s, z7.b, z5.b[0]\n"
+ "udot z11.s, z6.b, z0.b[0]\n"
+ "udot z15.s, z6.b, z1.b[0]\n"
+ "udot z19.s, z6.b, z2.b[0]\n"
+ "udot z23.s, z6.b, z3.b[0]\n"
+ "udot z27.s, z6.b, z4.b[0]\n"
+ "udot z31.s, z6.b, z5.b[0]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[1]\n"
+ "udot z12.s, z7.b, z1.b[1]\n"
+ "udot z16.s, z7.b, z2.b[1]\n"
+ "udot z20.s, z7.b, z3.b[1]\n"
"subs x27, x27, #0x4\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z4.b[1]\n"
+ "udot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[1]\n"
+ "udot z13.s, z6.b, z1.b[1]\n"
+ "udot z17.s, z6.b, z2.b[1]\n"
+ "udot z21.s, z6.b, z3.b[1]\n"
+ "udot z25.s, z6.b, z4.b[1]\n"
+ "udot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z30.s, z6.b, z5.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "udot z31.s, z7.b, z5.b[1]\n"
+ "udot z10.s, z7.b, z0.b[1]\n"
+ "udot z14.s, z7.b, z1.b[1]\n"
+ "udot z18.s, z7.b, z2.b[1]\n"
+ "udot z22.s, z7.b, z3.b[1]\n"
+ "udot z26.s, z7.b, z4.b[1]\n"
+ "udot z30.s, z7.b, z5.b[1]\n"
+ "udot z11.s, z6.b, z0.b[1]\n"
+ "udot z15.s, z6.b, z1.b[1]\n"
+ "udot z19.s, z6.b, z2.b[1]\n"
+ "udot z23.s, z6.b, z3.b[1]\n"
+ "udot z27.s, z6.b, z4.b[1]\n"
+ "udot z31.s, z6.b, z5.b[1]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[2]\n"
+ "udot z12.s, z7.b, z1.b[2]\n"
+ "udot z16.s, z7.b, z2.b[2]\n"
+ "udot z20.s, z7.b, z3.b[2]\n"
"subs x27, x27, #0x4\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z4.b[2]\n"
+ "udot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[2]\n"
+ "udot z13.s, z6.b, z1.b[2]\n"
+ "udot z17.s, z6.b, z2.b[2]\n"
+ "udot z21.s, z6.b, z3.b[2]\n"
+ "udot z25.s, z6.b, z4.b[2]\n"
+ "udot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z30.s, z6.b, z5.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "udot z31.s, z7.b, z5.b[2]\n"
+ "udot z10.s, z7.b, z0.b[2]\n"
+ "udot z14.s, z7.b, z1.b[2]\n"
+ "udot z18.s, z7.b, z2.b[2]\n"
+ "udot z22.s, z7.b, z3.b[2]\n"
+ "udot z26.s, z7.b, z4.b[2]\n"
+ "udot z30.s, z7.b, z5.b[2]\n"
+ "udot z11.s, z6.b, z0.b[2]\n"
+ "udot z15.s, z6.b, z1.b[2]\n"
+ "udot z19.s, z6.b, z2.b[2]\n"
+ "udot z23.s, z6.b, z3.b[2]\n"
+ "udot z27.s, z6.b, z4.b[2]\n"
+ "udot z31.s, z6.b, z5.b[2]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x10]\n"
- "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[3]\n"
+ "udot z12.s, z7.b, z1.b[3]\n"
+ "udot z16.s, z7.b, z2.b[3]\n"
+ "udot z20.s, z7.b, z3.b[3]\n"
+ "udot z24.s, z7.b, z4.b[3]\n"
+ "udot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[3]\n"
+ "udot z13.s, z6.b, z1.b[3]\n"
+ "udot z17.s, z6.b, z2.b[3]\n"
+ "udot z21.s, z6.b, z3.b[3]\n"
+ "udot z25.s, z6.b, z4.b[3]\n"
+ "udot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z30.s, z6.b, z5.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
- "udot z31.s, z7.b, z5.b[3]\n"
+ "udot z10.s, z7.b, z0.b[3]\n"
+ "udot z14.s, z7.b, z1.b[3]\n"
+ "udot z18.s, z7.b, z2.b[3]\n"
+ "udot z22.s, z7.b, z3.b[3]\n"
+ "udot z26.s, z7.b, z4.b[3]\n"
+ "udot z30.s, z7.b, z5.b[3]\n"
+ "udot z11.s, z6.b, z0.b[3]\n"
+ "udot z15.s, z6.b, z1.b[3]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "udot z23.s, z6.b, z3.b[3]\n"
+ "udot z27.s, z6.b, z4.b[3]\n"
+ "udot z31.s, z6.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1748,7 +1748,6 @@ void sve_hybrid_u8u32_dot_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1756,4 +1755,4 @@ void sve_hybrid_u8u32_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index c0d089278e..8c6a3dba7d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, uint32_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -86,7 +85,6 @@ public:
}
}
-
if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -111,5 +109,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
index 59f33289b4..9269576d90 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
@@ -100,16 +100,16 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"incw x20\n"
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 4f\n"
@@ -127,11 +127,11 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 7f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -143,86 +143,86 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d09a8a // ummla z10.s, z20.b, z16.b\n"
+ ".inst 0x45c79a8e // ummla z14.s, z20.b, z7.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
"addvl x10, x10, #8\n"
"ble 10f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
+ ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
+ ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
+ ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
+ ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
"addvl x10, x10, #8\n"
"10:" // Height 1: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -258,21 +258,21 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z18.s }, p4/Z, [x9]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
"b 15f\n"
@@ -290,12 +290,12 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
"cbnz x28, 18f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -303,95 +303,95 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 18f\n"
"17:" // Height 2: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
"cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d19a8a // ummla z10.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8e // ummla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
"sub x27, x27, #0x10\n"
"cmp x27, #0x10\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
"subs x27, x27, #0x8\n"
- "trn2 z1.d, z1.d, z2.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
"addvl x10, x10, #8\n"
"ble 21f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
+ ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
+ ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
+ ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
+ ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
"addvl x10, x10, #8\n"
"21:" // Height 2: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -399,24 +399,24 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"cmp x28, x20\n"
"bne 16b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
- "uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
- "uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
- "st1w { z8.s }, p4, [x24]\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x20]\n"
+ "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -437,28 +437,28 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
"zip1 z17.d, z18.d, z21.d\n"
@@ -490,13 +490,13 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
"cbnz x28, 29f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -505,169 +505,169 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 29f\n"
"28:" // Height 3: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
"cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
+ ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
+ ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
+ ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
+ ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
+ ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
+ ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
+ ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
+ ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
"ble 32f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
+ ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
+ ".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
+ ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
+ ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n"
+ ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n"
+ ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n"
+ ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n"
+ ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n"
+ ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n"
+ ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n"
"32:" // Height 3: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 27b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
"uzp1 z16.d, z16.d, z20.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp1 z17.d, z17.d, z21.d\n"
"uzp1 z18.d, z18.d, z22.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z8.s }, p4, [x21]\n"
"uzp1 z19.d, z19.d, z23.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -688,37 +688,37 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
- "add x22, x23, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
@@ -746,14 +746,14 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
"cbnz x28, 40f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -763,182 +763,182 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 40f\n"
"39:" // Height 4: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
"cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99ba8 // ummla z8.s, z29.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89bac // ummla z12.s, z29.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99ba9 // ummla z9.s, z29.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45d89bad // ummla z13.s, z29.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d99baa // ummla z10.s, z29.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
"cmp x27, #0x10\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d89bae // ummla z14.s, z29.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45d99bab // ummla z11.s, z29.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
"add x26, x26, #0x10\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d89baf // ummla z15.s, z29.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
+ ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
+ ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
"add x23, x23, #0x10\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
+ ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
+ ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
+ ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
+ ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
+ ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
+ ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b88 // ummla z8.s, z28.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b8c // ummla z12.s, z28.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b89 // ummla z9.s, z28.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d89b8d // ummla z13.s, z28.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45d99b8a // ummla z10.s, z28.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d89b8e // ummla z14.s, z28.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45d99b8b // ummla z11.s, z28.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ ".inst 0x45d89b8f // ummla z15.s, z28.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
"ble 43f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
+ ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
+ ".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
+ ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
+ ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n"
+ ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n"
+ ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n"
+ ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n"
+ ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n"
+ ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n"
+ ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n"
"43:" // Height 4: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 38b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
- "add x22, x23, x20, LSL #2\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "uzp1 z15.d, z16.d, z20.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z15.s }, p4, [x23]\n"
- "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z25.s }, p4, [x21]\n"
+ "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -959,54 +959,54 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip1 z9.d, z10.d, z13.d\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z17.s }, p4/Z, [x23]\n"
- "zip1 z10.d, z11.d, z14.d\n"
- "zip2 z14.d, z11.d, z14.d\n"
- "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
"zip1 z11.d, z16.d, z15.d\n"
"zip2 z15.d, z16.d, z15.d\n"
- "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
"zip1 z16.d, z17.d, z20.d\n"
"zip2 z20.d, z17.d, z20.d\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip1 z17.d, z18.d, z21.d\n"
"zip2 z21.d, z18.d, z21.d\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z25.s }, p4/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
"zip1 z18.d, z19.d, z22.d\n"
"zip2 z22.d, z19.d, z22.d\n"
- "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
"zip1 z19.d, z24.d, z23.d\n"
"zip2 z23.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
"zip1 z24.d, z25.d, z28.d\n"
"zip2 z28.d, z25.d, z28.d\n"
"zip1 z25.d, z26.d, z29.d\n"
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 48f\n"
"47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
@@ -1038,15 +1038,15 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
"cbnz x28, 51f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1057,231 +1057,231 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 51f\n"
"50:" // Height 5: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
"cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c198a8 // ummla z8.s, z5.b, z1.b\n"
+ ".inst 0x45c19870 // ummla z16.s, z3.b, z1.b\n"
+ ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c098ac // ummla z12.s, z5.b, z0.b\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c198a9 // ummla z9.s, z5.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c19871 // ummla z17.s, z3.b, z1.b\n"
+ ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c098ad // ummla z13.s, z5.b, z0.b\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c198aa // ummla z10.s, z5.b, z1.b\n"
+ ".inst 0x45c19872 // ummla z18.s, z3.b, z1.b\n"
+ ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ae // ummla z14.s, z5.b, z0.b\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
- ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ ".inst 0x45c198ab // ummla z11.s, z5.b, z1.b\n"
+ ".inst 0x45c19873 // ummla z19.s, z3.b, z1.b\n"
+ ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098af // ummla z15.s, z5.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
+ ".inst 0x45c198f0 // ummla z16.s, z7.b, z1.b\n"
+ ".inst 0x45c19898 // ummla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
+ ".inst 0x45c098f4 // ummla z20.s, z7.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
+ ".inst 0x45c198f1 // ummla z17.s, z7.b, z1.b\n"
+ ".inst 0x45c19899 // ummla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
+ ".inst 0x45c098f5 // ummla z21.s, z7.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
+ ".inst 0x45c198f2 // ummla z18.s, z7.b, z1.b\n"
+ ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
+ ".inst 0x45c098f6 // ummla z22.s, z7.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
+ ".inst 0x45c198f3 // ummla z19.s, z7.b, z1.b\n"
+ ".inst 0x45c1989b // ummla z27.s, z4.b, z1.b\n"
+ ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
+ ".inst 0x45c098f7 // ummla z23.s, z7.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n"
+ ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
+ ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n"
+ ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n"
+ ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
+ ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"ble 54f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
+ ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
+ ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
+ ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
+ ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
+ ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
+ ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
+ ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
- ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n"
+ ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n"
+ ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n"
+ ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
"54:" // Height 5: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
"cmp x28, x20\n"
"bne 49b\n"
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x9, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x9, x20, LSL #2\n"
"add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
"add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
- "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
"uzp2 z9.d, z9.d, z13.d\n"
- "uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
- "uzp1 z14.d, z11.d, z15.d\n"
- "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
"uzp2 z11.d, z11.d, z15.d\n"
- "uzp1 z15.d, z16.d, z20.d\n"
- "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
"uzp2 z16.d, z16.d, z20.d\n"
- "uzp1 z20.d, z17.d, z21.d\n"
- "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
"uzp2 z17.d, z17.d, z21.d\n"
"uzp1 z21.d, z18.d, z22.d\n"
- "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z8.s }, p4, [x23]\n"
"uzp2 z18.d, z18.d, z22.d\n"
- "uzp1 z22.d, z19.d, z23.d\n"
- "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
"uzp2 z19.d, z19.d, z23.d\n"
"uzp1 z24.d, z24.d, z28.d\n"
- "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
"uzp1 z25.d, z25.d, z29.d\n"
"uzp1 z26.d, z26.d, z30.d\n"
- "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
"uzp1 z27.d, z27.d, z31.d\n"
- "st1w { z15.s }, p4, [x23]\n"
- "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z1.s }, p4, [x22]\n"
+ "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
@@ -1307,26 +1307,26 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "ld1w { z9.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
- "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
"add x20, x21, x20, LSL #2\n"
"ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
- "zip1 z8.d, z9.d, z12.d\n"
+ "zip1 z8.d, z17.d, z12.d\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "zip2 z12.d, z9.d, z12.d\n"
- "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z17.s }, p4/Z, [x23]\n"
- "zip2 z13.d, z10.d, z13.d\n"
- "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
"ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
- "zip2 z14.d, z11.d, z14.d\n"
+ "zip2 z14.d, z20.d, z14.d\n"
"zip1 z11.d, z16.d, z15.d\n"
"ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
"ld1w { z20.s }, p4/Z, [x22]\n"
@@ -1344,7 +1344,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
"zip2 z22.d, z19.d, z22.d\n"
"zip1 z19.d, z24.d, z23.d\n"
- "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
"ld1w { z28.s }, p4/Z, [x20]\n"
"zip2 z23.d, z24.d, z23.d\n"
"zip1 z24.d, z25.d, z28.d\n"
@@ -1356,8 +1356,8 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"zip2 z29.d, z26.d, z29.d\n"
"zip1 z26.d, z27.d, z30.d\n"
"zip2 z30.d, z27.d, z30.d\n"
- "zip1 z27.d, z6.d, z31.d\n"
- "zip2 z31.d, z6.d, z31.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
"b 59f\n"
"58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
@@ -1389,16 +1389,16 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
- "ldr x25, [x21, #0x8]\n"
- "ldr x24, [x21, #0x10]\n"
- "ldr x23, [x21, #0x18]\n"
- "ldr x22, [x21, #0x20]\n"
- "ldr x21, [x21, #0x28]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
"cbnz x28, 62f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -1410,184 +1410,184 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"b 62f\n"
"61:" // Height 6: setup direct input
"mov x26, %x[input_ptr]\n"
- "add x25, x26, x20\n"
- "add x24, x25, x20\n"
- "add x23, x24, x20\n"
- "add x22, x23, x20\n"
- "add x21, x22, x20\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
"cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
- "ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
- "ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
+ ".inst 0x45c19890 // ummla z16.s, z4.b, z1.b\n"
+ ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"sub x27, x27, #0x10\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
+ ".inst 0x45c09894 // ummla z20.s, z4.b, z0.b\n"
"cmp x27, #0x10\n"
"add x26, x26, #0x10\n"
- ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
"add x25, x25, #0x10\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c19891 // ummla z17.s, z4.b, z1.b\n"
+ ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
+ ".inst 0x45c09895 // ummla z21.s, z4.b, z0.b\n"
"add x23, x23, #0x10\n"
"add x22, x22, #0x10\n"
- ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
"add x21, x21, #0x10\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c19892 // ummla z18.s, z4.b, z1.b\n"
+ ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
+ ".inst 0x45c09896 // ummla z22.s, z4.b, z0.b\n"
+ ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
- ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
+ ".inst 0x45c19893 // ummla z19.s, z4.b, z1.b\n"
+ ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
+ ".inst 0x45c09897 // ummla z23.s, z4.b, z0.b\n"
+ ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
+ ".inst 0x45c198b0 // ummla z16.s, z5.b, z1.b\n"
+ ".inst 0x45c19878 // ummla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098b4 // ummla z20.s, z5.b, z0.b\n"
+ ".inst 0x45c0987c // ummla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45c198e9 // ummla z9.s, z7.b, z1.b\n"
+ ".inst 0x45c198b1 // ummla z17.s, z5.b, z1.b\n"
+ ".inst 0x45c19879 // ummla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098b5 // ummla z21.s, z5.b, z0.b\n"
+ ".inst 0x45c0987d // ummla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n"
+ ".inst 0x45c198b2 // ummla z18.s, z5.b, z1.b\n"
+ ".inst 0x45c1987a // ummla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098b6 // ummla z22.s, z5.b, z0.b\n"
+ ".inst 0x45c0987e // ummla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45c198eb // ummla z11.s, z7.b, z1.b\n"
+ ".inst 0x45c198b3 // ummla z19.s, z5.b, z1.b\n"
+ ".inst 0x45c1987b // ummla z27.s, z3.b, z1.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
+ ".inst 0x45c0987f // ummla z31.s, z3.b, z0.b\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z1.b }, p0/Z, [x26]\n"
- "ld1rqb { z2.b }, p0/Z, [x25]\n"
- "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
"ld1rqb { z3.b }, p0/Z, [x24]\n"
- "ld1rqb { z4.b }, p0/Z, [x23]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
"ld1rqb { z5.b }, p0/Z, [x22]\n"
- "ld1rqb { z6.b }, p0/Z, [x21]\n"
- "trn2 z3.d, z3.d, z4.d\n"
- "trn1 z4.d, z5.d, z6.d\n"
- "trn2 z5.d, z5.d, z6.d\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
- ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
- ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n"
+ ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
+ ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
"subs x27, x27, #0x8\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
- ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
- ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
- ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n"
+ ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n"
+ ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
- ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
- ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
- ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
- ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
+ ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"ble 65f\n"
- "ld1b { z7.b }, p5/Z, [x10]\n"
- "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
- ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
- ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
- ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
- ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
- ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
- ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
- ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
- ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
- ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
- ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
- ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
- "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
+ ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
+ ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
+ ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
+ ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
+ ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
+ ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
+ ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #8\n"
- ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
- ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
- ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
- ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n"
+ ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n"
+ ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n"
+ ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
"65:" // Height 6: Multiply loop: multiply skip
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x28, x28, #0x1\n"
@@ -1596,7 +1596,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"add x24, x9, x20, LSL #2\n"
"add x23, x24, x20, LSL #2\n"
- "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp1 z0.d, z8.d, z12.d\n"
"add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"uzp2 z8.d, z8.d, z12.d\n"
@@ -1604,7 +1604,7 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"add x20, x21, x20, LSL #2\n"
"uzp2 z9.d, z9.d, z13.d\n"
"uzp1 z13.d, z10.d, z14.d\n"
- "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z0.s }, p4, [x9]\n"
"uzp2 z10.d, z10.d, z14.d\n"
"uzp1 z14.d, z11.d, z15.d\n"
"st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1664,7 +1664,6 @@ void sve_hybrid_u8u32_mmla_6x4VL (
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1672,4 +1671,4 @@ void sve_hybrid_u8u32_mmla_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
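The hunks above only renumber scratch registers; the dataflow is easier to follow with the architectural semantics of UMMLA in mind. Within each 128-bit vector segment, the instruction treats both byte sources as 2x8 row-major matrices and accumulates their widened 2x2 product into the .s destination; the trn1/trn2 pairs on the row loads pack two A rows into one vector so each segment carries such a tile, and the uzp1/uzp2 pairs before the stores split the interleaved accumulators back into output rows. The following is a minimal scalar sketch of one segment, assuming the standard architectural definition; it is illustrative only, not code from the library.

#include <cstdint>

// Scalar model of one 128-bit UMMLA segment, e.g.
// ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b":
// acc (2x2 uint32) += a (2x8 bytes, row-major) * transpose(b),
// where b is also held as 2x8 row-major bytes.
static void ummla_segment(uint32_t acc[2][2],
                          const uint8_t a[2][8],
                          const uint8_t b[2][8])
{
    for (int i = 0; i < 2; ++i)         // row of the 2x2 tile
        for (int j = 0; j < 2; ++j)     // column of the 2x2 tile
            for (int k = 0; k < 8; ++k) // 8-deep dot product
                acc[i][j] += uint32_t(a[i][k]) * uint32_t(b[j][k]);
}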
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index f5fdf993aa..1ae035c614 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return get_vector_length<float>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<float>();
- }
-
static constexpr unsigned int k_unroll()
{
return 2;
@@ -97,5 +92,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 94452929c6..e507bc5551 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_bf16fp32_dot_8x3VL(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,10 +89,10 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
"3:" // main loop head
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n"
".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
"sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
"addvl x22, x22, #6\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- ".inst 0x646a408b // bfdot z11.s, z4.h, z2.h[1]\n"
+ ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n"
+ ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6472408e // bfdot z14.s, z4.h, z2.h[2]\n"
- ".inst 0x647a4091 // bfdot z17.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4097 // bfdot z23.s, z4.h, z3.h[1]\n"
- ".inst 0x6473409a // bfdot z26.s, z4.h, z3.h[2]\n"
- ".inst 0x647b409d // bfdot z29.s, z4.h, z3.h[3]\n"
+ ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n"
+ ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n"
+ ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n"
+ ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n"
+ ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n"
+ ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n"
"ld1h { z4.h }, p0/Z, [x22]\n"
- ".inst 0x646240a9 // bfdot z9.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ac // bfdot z12.s, z5.h, z2.h[1]\n"
- ".inst 0x647240af // bfdot z15.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40b2 // bfdot z18.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b5 // bfdot z21.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b8 // bfdot z24.s, z5.h, z3.h[1]\n"
- ".inst 0x647340bb // bfdot z27.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40be // bfdot z30.s, z5.h, z3.h[3]\n"
+ ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n"
+ ".inst 0x646b40ac // bfdot z12.s, z5.h, z3.h[1]\n"
+ ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n"
+ ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n"
+ ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n"
+ ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n"
+ ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n"
+ ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n"
"ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x646240ca // bfdot z10.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40cd // bfdot z13.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40d9 // bfdot z25.s, z6.h, z3.h[1]\n"
- ".inst 0x647340dc // bfdot z28.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
+ ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n"
+ ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n"
+ ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n"
+ ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n"
+ ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n"
+ ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n"
+ ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n"
"ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -174,37 +178,37 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
"cbz x20, 5f\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1h { z7.h }, p0/Z, [x22]\n"
- "ld1h { z4.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
- "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x647040ee // bfdot z14.s, z7.h, z0.h[2]\n"
- ".inst 0x647840f1 // bfdot z17.s, z7.h, z0.h[3]\n"
- ".inst 0x646140f4 // bfdot z20.s, z7.h, z1.h[0]\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n"
+ ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n"
+ ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n"
+ ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n"
"addvl x22, x22, #3\n"
- ".inst 0x646940f7 // bfdot z23.s, z7.h, z1.h[1]\n"
- ".inst 0x647140fa // bfdot z26.s, z7.h, z1.h[2]\n"
- ".inst 0x647940fd // bfdot z29.s, z7.h, z1.h[3]\n"
- ".inst 0x64604089 // bfdot z9.s, z4.h, z0.h[0]\n"
- ".inst 0x6468408c // bfdot z12.s, z4.h, z0.h[1]\n"
- ".inst 0x6470408f // bfdot z15.s, z4.h, z0.h[2]\n"
- ".inst 0x64784092 // bfdot z18.s, z4.h, z0.h[3]\n"
- ".inst 0x64614095 // bfdot z21.s, z4.h, z1.h[0]\n"
- ".inst 0x64694098 // bfdot z24.s, z4.h, z1.h[1]\n"
- ".inst 0x6471409b // bfdot z27.s, z4.h, z1.h[2]\n"
- ".inst 0x6479409e // bfdot z30.s, z4.h, z1.h[3]\n"
- ".inst 0x646040aa // bfdot z10.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040b0 // bfdot z16.s, z5.h, z0.h[2]\n"
- ".inst 0x647840b3 // bfdot z19.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b6 // bfdot z22.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140bc // bfdot z28.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bf // bfdot z31.s, z5.h, z1.h[3]\n"
+ ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n"
+ ".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n"
+ ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n"
+ ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n"
+ ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n"
+ ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n"
+ ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n"
+ ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
+ ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n"
+ ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n"
+ ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n"
+ ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n"
+ ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n"
+ ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n"
+ ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n"
+ ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
+ ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n"
+ ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -243,4 +247,4 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
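Every bfdot rewritten above uses the indexed form, bfdot zD.s, zN.h, zM.h[i]: within each 128-bit segment, each fp32 lane of the destination accumulates the two-element dot product of the corresponding bf16 pair in the first source with the pair the immediate selects from the second source. A minimal scalar sketch of one segment follows, assuming exact widening of bf16 to fp32 (a bfloat16 is the high half of an IEEE-754 float) and ignoring the instruction's internal rounding details; the names are illustrative.

#include <cstdint>
#include <cstring>

// Widen bf16 (stored as uint16_t) to fp32.
static float bf16_to_f32(uint16_t h)
{
    uint32_t bits = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar model of one 128-bit segment of "bfdot zD.s, zN.h, zM.h[idx]":
// four fp32 accumulator lanes, eight bf16 lanes per source segment.
static void bfdot_lane_segment(float acc[4], const uint16_t n[8],
                               const uint16_t m[8], int idx /* 0..3 */)
{
    const float m0 = bf16_to_f32(m[2 * idx]);
    const float m1 = bf16_to_f32(m[2 * idx + 1]);
    for (int lane = 0; lane < 4; ++lane)
        acc[lane] += bf16_to_f32(n[2 * lane]) * m0 +
                     bf16_to_f32(n[2 * lane + 1]) * m1;
}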
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index 1de8c68494..c5096ff4ba 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return get_vector_length<float>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<float>();
- }
-
static constexpr unsigned int k_unroll()
{
return 4;
@@ -109,5 +104,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index fe5382db05..ba7185752a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_bf16fp32_mmla_8x3VL(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,82 +89,82 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
"mov z31.b, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- "ld1h { z6.h }, p0/Z, [x22]\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
- ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
+ ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
- ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"cmp x20, #0x2\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
"ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n"
- "ld1h { z4.h }, p0/Z, [x22, #6, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #7, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #7, MUL VL]\n"
"addvl x22, x22, #16\n"
".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x22, #-8, MUL VL]\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
+ ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
+ ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1h { z7.h }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
- ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
- ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
"ld1h { z4.h }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
"ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
+ ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
"ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
@@ -168,114 +172,114 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
"ld1h { z6.h }, p0/Z, [x22]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
- ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
- ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
- ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
"addvl x22, x22, #4\n"
- ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
- ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
"cbz x20, 5f\n"
- "ld1h { z6.h }, p0/Z, [x22]\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
- ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
+ ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
"addvl x22, x22, #6\n"
- ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
- ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
+ ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
- ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
- ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
- ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n"
+ ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n"
+ ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
"5:" // multiply loop done
- "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "st1w { z4.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z11.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
- "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z12.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
"st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
"subs x23, x23, #0x1\n"
"st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "uzp1 z18.d, z16.d, z19.d\n"
- "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "uzp1 z19.d, z20.d, z23.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
"addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
"uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
"st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
"st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
"st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "uzp1 z30.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
"st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@ void sve_interleaved_bf16fp32_mmla_8x3VL(
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
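BFMMLA follows the same segment-wise tile pattern as UMMLA, only on 2x4 bf16 rows with fp32 accumulation, which is why the epilogue above unzips the accumulators with uzp1/uzp2 before storing: even and odd rows of each interleaved 2x2 tile land in separate output rows. A scalar sketch of one segment, under the same caveats as the earlier models (illustrative names, exact instruction rounding not modelled):

#include <cstdint>
#include <cstring>

static float bf16_as_f32(uint16_t h) // bf16 is the high half of an fp32
{
    uint32_t bits = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar model of one 128-bit BFMMLA segment:
// acc (2x2 fp32) += a (2x4 bf16, row-major) * transpose(b),
// where b is also held as 2x4 row-major bf16.
static void bfmmla_segment(float acc[2][2],
                           const uint16_t a[2][4],
                           const uint16_t b[2][4])
{
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 4; ++k)
                acc[i][j] += bf16_as_f32(a[i][k]) * bf16_as_f32(b[j][k]);
}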
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 6f1089d517..6c54167763 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return get_vector_length<__fp16>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<__fp16>();
- }
-
static constexpr unsigned int k_unroll()
{
return 1;
@@ -81,6 +76,8 @@ public:
return { 13.84, 2.07, 2.52 };
case CPUModel::V1:
return { 31.90, 5.15, 10.34 };
+ case CPUModel::A64FX:
+ return { 44.34, 3.23, 7.06 };
}
}
@@ -104,5 +101,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
index 9287509889..609277d889 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -28,8 +28,12 @@
namespace arm_gemm {
void sve_interleaved_fp16_mla_8x3VL_a64fx(
- const __fp16 *Apanel, const __fp16 *Bpanel,
- __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ __fp16 *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -88,7 +92,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z9.h, p0/M, z1.h, z3.h\n"
"sub x20, x20, #0x2\n"
"fmla z10.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -97,63 +101,63 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z15.h, p0/M, z1.h, z5.h\n"
"cmp x20, #0x2\n"
"fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #12]\n"
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
"fmla z23.h, p0/M, z0.h, z4.h\n"
"fmla z24.h, p0/M, z1.h, z4.h\n"
"fmla z25.h, p0/M, z2.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p0/Z, [x22, #3, MUL VL]\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
- "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
- "fmla z8.h, p0/M, z0.h, z3.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n"
- "fmla z9.h, p0/M, z1.h, z3.h\n"
- "fmla z10.h, p0/M, z2.h, z3.h\n"
- "fmla z11.h, p0/M, z0.h, z4.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z12.h, p0/M, z1.h, z4.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "fmla z26.h, p0/M, z0.h, z3.h\n"
+ "fmla z27.h, p0/M, z1.h, z3.h\n"
+ "fmla z28.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z29.h, p0/M, z0.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.h, p0/M, z1.h, z5.h\n"
+ "fmla z31.h, p0/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.h, p0/M, z6.h, z7.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+ "fmla z9.h, p0/M, z2.h, z7.h\n"
+ "fmla z10.h, p0/M, z5.h, z7.h\n"
+ "fmla z11.h, p0/M, z6.h, z4.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z2.h, z4.h\n"
+ "fmla z13.h, p0/M, z5.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
- "fmla z14.h, p0/M, z0.h, z5.h\n"
- "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "fmla z14.h, p0/M, z6.h, z3.h\n"
+ "fmla z15.h, p0/M, z2.h, z3.h\n"
"addvl x22, x22, #6\n"
- "fmla z16.h, p0/M, z2.h, z5.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z17.h, p0/M, z0.h, z6.h\n"
- "fmla z18.h, p0/M, z1.h, z6.h\n"
- "fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n"
+ "fmla z16.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z6.h, z1.h\n"
+ "fmla z18.h, p0/M, z2.h, z1.h\n"
+ "fmla z19.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z20.h, p0/M, z6.h, z7.h\n"
+ "fmla z21.h, p0/M, z2.h, z7.h\n"
+ "fmla z22.h, p0/M, z5.h, z7.h\n"
+ "fmla z23.h, p0/M, z6.h, z4.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z24.h, p0/M, z2.h, z4.h\n"
+ "fmla z25.h, p0/M, z5.h, z4.h\n"
"ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "fmla z26.h, p0/M, z6.h, z0.h\n"
+ "fmla z27.h, p0/M, z2.h, z0.h\n"
+ "fmla z28.h, p0/M, z5.h, z0.h\n"
+ "fmla z29.h, p0/M, z6.h, z1.h\n"
"ld1h { z0.h }, p0/Z, [x22]\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z30.h, p0/M, z2.h, z1.h\n"
+ "fmla z31.h, p0/M, z5.h, z1.h\n"
"ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
"ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
@@ -164,7 +168,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z9.h, p0/M, z1.h, z3.h\n"
"addvl x22, x22, #3\n"
"fmla z10.h, p0/M, z2.h, z3.h\n"
- "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
"fmla z11.h, p0/M, z0.h, z4.h\n"
"fmla z12.h, p0/M, z1.h, z4.h\n"
"fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -176,58 +180,58 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"fmla z17.h, p0/M, z0.h, z6.h\n"
"fmla z18.h, p0/M, z1.h, z6.h\n"
"fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
"fmla z23.h, p0/M, z0.h, z4.h\n"
"fmla z24.h, p0/M, z1.h, z4.h\n"
"fmla z25.h, p0/M, z2.h, z4.h\n"
"fmla z26.h, p0/M, z0.h, z5.h\n"
"fmla z27.h, p0/M, z1.h, z5.h\n"
"fmla z28.h, p0/M, z2.h, z5.h\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z29.h, p0/M, z0.h, z3.h\n"
+ "fmla z30.h, p0/M, z1.h, z3.h\n"
+ "fmla z31.h, p0/M, z2.h, z3.h\n"
"cbz x20, 5f\n"
- "ld1h { z0.h }, p0/Z, [x22]\n"
- "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z6.h }, p0/Z, [x22]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.h, p0/M, z0.h, z3.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z9.h, p0/M, z1.h, z3.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
- "fmla z10.h, p0/M, z2.h, z3.h\n"
- "fmla z11.h, p0/M, z0.h, z4.h\n"
- "fmla z12.h, p0/M, z1.h, z4.h\n"
- "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+ "fmla z10.h, p0/M, z4.h, z3.h\n"
+ "fmla z11.h, p0/M, z6.h, z2.h\n"
+ "fmla z12.h, p0/M, z5.h, z2.h\n"
+ "fmla z13.h, p0/M, z4.h, z2.h\n"
"ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
- "fmla z14.h, p0/M, z0.h, z5.h\n"
- "fmla z15.h, p0/M, z1.h, z5.h\n"
- "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
- "fmla z16.h, p0/M, z2.h, z5.h\n"
- "fmla z17.h, p0/M, z0.h, z6.h\n"
- "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z18.h, p0/M, z1.h, z6.h\n"
- "fmla z19.h, p0/M, z2.h, z6.h\n"
- "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
- "fmla z20.h, p0/M, z0.h, z3.h\n"
- "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z14.h, p0/M, z6.h, z1.h\n"
+ "fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z16.h, p0/M, z4.h, z1.h\n"
+ "fmla z17.h, p0/M, z6.h, z0.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z18.h, p0/M, z5.h, z0.h\n"
+ "fmla z19.h, p0/M, z4.h, z0.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z6.h, z3.h\n"
+ "fmla z21.h, p0/M, z5.h, z3.h\n"
"addvl x22, x22, #3\n"
- "fmla z22.h, p0/M, z2.h, z3.h\n"
- "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z22.h, p0/M, z4.h, z3.h\n"
+ "fmla z23.h, p0/M, z6.h, z2.h\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla z24.h, p0/M, z1.h, z4.h\n"
- "fmla z25.h, p0/M, z2.h, z4.h\n"
- "fmla z26.h, p0/M, z0.h, z5.h\n"
- "fmla z27.h, p0/M, z1.h, z5.h\n"
- "fmla z28.h, p0/M, z2.h, z5.h\n"
- "fmla z29.h, p0/M, z0.h, z6.h\n"
- "fmla z30.h, p0/M, z1.h, z6.h\n"
- "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "fmla z24.h, p0/M, z5.h, z2.h\n"
+ "fmla z25.h, p0/M, z4.h, z2.h\n"
+ "fmla z26.h, p0/M, z6.h, z1.h\n"
+ "fmla z27.h, p0/M, z5.h, z1.h\n"
+ "fmla z28.h, p0/M, z4.h, z1.h\n"
+ "fmla z29.h, p0/M, z6.h, z0.h\n"
+ "fmla z30.h, p0/M, z5.h, z0.h\n"
+ "fmla z31.h, p0/M, z4.h, z0.h\n"
"5:" // multiply loop done
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -261,7 +265,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
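The A64FX variant above multiplies against scalar-broadcast A values (ld1rh replicates one half-word across the vector, then a predicated fmla consumes it), whereas the generic variant below uses the indexed-element form fmla zD.h, zN.h, zM.h[i], which selects the multiplier within each 128-bit segment of a quadword load. In scalar terms the two differ only in where the broadcast happens; a brief sketch under that assumption, with hypothetical names:

// Broadcast form (a64fx path): one A element replicated across all lanes.
static void fmla_broadcast(__fp16 *c, const __fp16 *b, __fp16 a_scalar,
                           int VL)
{
    for (int l = 0; l < VL; ++l)
        c[l] += b[l] * a_scalar;
}

// Indexed form (generic path): fmla picks element idx within each 128-bit
// segment (8 fp16 lanes); ld1rqh replicates one quadword into every
// segment, so here all lanes end up with the same multiplier.
static void fmla_indexed(__fp16 *c, const __fp16 *b, const __fp16 *a,
                         int idx, int VL)
{
    for (int l = 0; l < VL; ++l)
        c[l] += b[l] * a[(l / 8) * 8 + idx];
}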
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index 1ac2ac075e..3b16c97e2c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -28,8 +28,12 @@
namespace arm_gemm {
void sve_interleaved_fp16_mla_8x3VL(
- const __fp16 *Apanel, const __fp16 *Bpanel,
- __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ __fp16 *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -83,16 +87,16 @@ void sve_interleaved_fp16_mla_8x3VL(
"3:" // main loop head
"fmla z8.h, z2.h, z0.h[0]\n"
"fmla z11.h, z2.h, z0.h[1]\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
"fmla z14.h, z2.h, z0.h[2]\n"
"fmla z17.h, z2.h, z0.h[3]\n"
- "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
"fmla z20.h, z2.h, z0.h[4]\n"
"fmla z23.h, z2.h, z0.h[5]\n"
- "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #4, MUL VL]\n"
"fmla z26.h, z2.h, z0.h[6]\n"
"fmla z29.h, z2.h, z0.h[7]\n"
- "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #5, MUL VL]\n"
"fmla z9.h, z3.h, z0.h[0]\n"
"fmla z12.h, z3.h, z0.h[1]\n"
"addvl x22, x22, #6\n"
@@ -116,31 +120,31 @@ void sve_interleaved_fp16_mla_8x3VL(
"fmla z28.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "fmla z11.h, z5.h, z1.h[1]\n"
+ "fmla z8.h, z6.h, z7.h[0]\n"
+ "fmla z11.h, z6.h, z7.h[1]\n"
"ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
- "fmla z14.h, z5.h, z1.h[2]\n"
- "fmla z17.h, z5.h, z1.h[3]\n"
- "fmla z20.h, z5.h, z1.h[4]\n"
- "fmla z23.h, z5.h, z1.h[5]\n"
- "fmla z26.h, z5.h, z1.h[6]\n"
- "fmla z29.h, z5.h, z1.h[7]\n"
- "fmla z9.h, z6.h, z1.h[0]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z15.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z1.h[3]\n"
- "fmla z21.h, z6.h, z1.h[4]\n"
- "fmla z24.h, z6.h, z1.h[5]\n"
- "fmla z27.h, z6.h, z1.h[6]\n"
- "fmla z30.h, z6.h, z1.h[7]\n"
- "fmla z10.h, z7.h, z1.h[0]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z16.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z1.h[3]\n"
- "fmla z22.h, z7.h, z1.h[4]\n"
- "fmla z25.h, z7.h, z1.h[5]\n"
- "fmla z28.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
+ "fmla z14.h, z6.h, z7.h[2]\n"
+ "fmla z17.h, z6.h, z7.h[3]\n"
+ "fmla z20.h, z6.h, z7.h[4]\n"
+ "fmla z23.h, z6.h, z7.h[5]\n"
+ "fmla z26.h, z6.h, z7.h[6]\n"
+ "fmla z29.h, z6.h, z7.h[7]\n"
+ "fmla z9.h, z5.h, z7.h[0]\n"
+ "fmla z12.h, z5.h, z7.h[1]\n"
+ "fmla z15.h, z5.h, z7.h[2]\n"
+ "fmla z18.h, z5.h, z7.h[3]\n"
+ "fmla z21.h, z5.h, z7.h[4]\n"
+ "fmla z24.h, z5.h, z7.h[5]\n"
+ "fmla z27.h, z5.h, z7.h[6]\n"
+ "fmla z30.h, z5.h, z7.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z13.h, z1.h, z7.h[1]\n"
+ "fmla z16.h, z1.h, z7.h[2]\n"
+ "fmla z19.h, z1.h, z7.h[3]\n"
+ "fmla z22.h, z1.h, z7.h[4]\n"
+ "fmla z25.h, z1.h, z7.h[5]\n"
+ "fmla z28.h, z1.h, z7.h[6]\n"
+ "fmla z31.h, z1.h, z7.h[7]\n"
"bge 3b\n"
"4:" // main loop skip
"fmla z8.h, z2.h, z0.h[0]\n"
@@ -170,36 +174,36 @@ void sve_interleaved_fp16_mla_8x3VL(
"fmla z28.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"cbz x20, 5f\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "ld1h { z5.h }, p0/Z, [x22]\n"
- "fmla z8.h, z5.h, z0.h[0]\n"
- "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1h { z7.h }, p0/Z, [x22, #2, MUL VL]\n"
- "fmla z11.h, z5.h, z0.h[1]\n"
- "fmla z14.h, z5.h, z0.h[2]\n"
- "fmla z17.h, z5.h, z0.h[3]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "fmla z11.h, z2.h, z3.h[1]\n"
+ "fmla z14.h, z2.h, z3.h[2]\n"
+ "fmla z17.h, z2.h, z3.h[3]\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla z20.h, z5.h, z0.h[4]\n"
- "fmla z23.h, z5.h, z0.h[5]\n"
+ "fmla z20.h, z2.h, z3.h[4]\n"
+ "fmla z23.h, z2.h, z3.h[5]\n"
"addvl x22, x22, #3\n"
- "fmla z26.h, z5.h, z0.h[6]\n"
- "fmla z29.h, z5.h, z0.h[7]\n"
- "fmla z9.h, z6.h, z0.h[0]\n"
- "fmla z12.h, z6.h, z0.h[1]\n"
- "fmla z15.h, z6.h, z0.h[2]\n"
- "fmla z18.h, z6.h, z0.h[3]\n"
- "fmla z21.h, z6.h, z0.h[4]\n"
- "fmla z24.h, z6.h, z0.h[5]\n"
- "fmla z27.h, z6.h, z0.h[6]\n"
- "fmla z30.h, z6.h, z0.h[7]\n"
- "fmla z10.h, z7.h, z0.h[0]\n"
- "fmla z13.h, z7.h, z0.h[1]\n"
- "fmla z16.h, z7.h, z0.h[2]\n"
- "fmla z19.h, z7.h, z0.h[3]\n"
- "fmla z22.h, z7.h, z0.h[4]\n"
- "fmla z25.h, z7.h, z0.h[5]\n"
- "fmla z28.h, z7.h, z0.h[6]\n"
- "fmla z31.h, z7.h, z0.h[7]\n"
+ "fmla z26.h, z2.h, z3.h[6]\n"
+ "fmla z29.h, z2.h, z3.h[7]\n"
+ "fmla z9.h, z1.h, z3.h[0]\n"
+ "fmla z12.h, z1.h, z3.h[1]\n"
+ "fmla z15.h, z1.h, z3.h[2]\n"
+ "fmla z18.h, z1.h, z3.h[3]\n"
+ "fmla z21.h, z1.h, z3.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z27.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z3.h[7]\n"
+ "fmla z10.h, z0.h, z3.h[0]\n"
+ "fmla z13.h, z0.h, z3.h[1]\n"
+ "fmla z16.h, z0.h, z3.h[2]\n"
+ "fmla z19.h, z0.h, z3.h[3]\n"
+ "fmla z22.h, z0.h, z3.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z28.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z3.h[7]\n"
"5:" // multiply loop done
"st1h { z8.h }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index 29b928ee3b..23ab7ce10a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@ public:
return get_vector_length<float>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<float>();
- }
-
static constexpr unsigned int k_unroll()
{
return 1;
@@ -75,10 +70,14 @@ public:
if (std::is_same<T, float>::value) {
switch (ci->get_cpu_model()) {
- case CPUModel::V1:
- return { 15.15, 9.24, 6.42 };
default:
return { 7.2307, 3.876, 2.932 };
+ case CPUModel::A64FX:
+ return { 26.52, 3.42, 4.59 };
+ case CPUModel::A510:
+ return { 6.25, 3.84, 2.47 };
+ case CPUModel::V1:
+ return { 15.15, 9.24, 6.42 };
}
}
@@ -102,5 +101,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
index 3141a258a8..0b13913717 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -28,8 +28,12 @@
namespace arm_gemm {
void sve_interleaved_fp32_mla_8x3VL_a64fx(
- const float *Apanel, const float *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *Apanel,
+ const float *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -88,7 +92,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z9.s, p0/M, z1.s, z3.s\n"
"sub x20, x20, #0x2\n"
"fmla z10.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -97,63 +101,63 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z15.s, p0/M, z1.s, z5.s\n"
"cmp x20, #0x2\n"
"fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"fmla z23.s, p0/M, z0.s, z4.s\n"
"fmla z24.s, p0/M, z1.s, z4.s\n"
"fmla z25.s, p0/M, z2.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p0/Z, [x22, #3, MUL VL]\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
- "ld1w { z1.s }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
- "fmla z8.s, p0/M, z0.s, z3.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
- "fmla z9.s, p0/M, z1.s, z3.s\n"
- "fmla z10.s, p0/M, z2.s, z3.s\n"
- "fmla z11.s, p0/M, z0.s, z4.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
- "fmla z12.s, p0/M, z1.s, z4.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "fmla z26.s, p0/M, z0.s, z3.s\n"
+ "fmla z27.s, p0/M, z1.s, z3.s\n"
+ "fmla z28.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "fmla z29.s, p0/M, z0.s, z5.s\n"
+ "ld1w { z6.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.s, p0/M, z1.s, z5.s\n"
+ "fmla z31.s, p0/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.s, p0/M, z6.s, z7.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "fmla z9.s, p0/M, z2.s, z7.s\n"
+ "fmla z10.s, p0/M, z5.s, z7.s\n"
+ "fmla z11.s, p0/M, z6.s, z4.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z2.s, z4.s\n"
+ "fmla z13.s, p0/M, z5.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "fmla z14.s, p0/M, z0.s, z5.s\n"
- "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "fmla z14.s, p0/M, z6.s, z3.s\n"
+ "fmla z15.s, p0/M, z2.s, z3.s\n"
"addvl x22, x22, #6\n"
- "fmla z16.s, p0/M, z2.s, z5.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
- "fmla z17.s, p0/M, z0.s, z6.s\n"
- "fmla z18.s, p0/M, z1.s, z6.s\n"
- "fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "fmla z16.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z6.s, z1.s\n"
+ "fmla z18.s, p0/M, z2.s, z1.s\n"
+ "fmla z19.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z20.s, p0/M, z6.s, z7.s\n"
+ "fmla z21.s, p0/M, z2.s, z7.s\n"
+ "fmla z22.s, p0/M, z5.s, z7.s\n"
+ "fmla z23.s, p0/M, z6.s, z4.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z24.s, p0/M, z2.s, z4.s\n"
+ "fmla z25.s, p0/M, z5.s, z4.s\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "fmla z26.s, p0/M, z6.s, z0.s\n"
+ "fmla z27.s, p0/M, z2.s, z0.s\n"
+ "fmla z28.s, p0/M, z5.s, z0.s\n"
+ "fmla z29.s, p0/M, z6.s, z1.s\n"
"ld1w { z0.s }, p0/Z, [x22]\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z30.s, p0/M, z2.s, z1.s\n"
+ "fmla z31.s, p0/M, z5.s, z1.s\n"
"ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
"ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -164,7 +168,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z9.s, p0/M, z1.s, z3.s\n"
"addvl x22, x22, #3\n"
"fmla z10.s, p0/M, z2.s, z3.s\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"fmla z11.s, p0/M, z0.s, z4.s\n"
"fmla z12.s, p0/M, z1.s, z4.s\n"
"fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -176,58 +180,58 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"fmla z17.s, p0/M, z0.s, z6.s\n"
"fmla z18.s, p0/M, z1.s, z6.s\n"
"fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
"fmla z23.s, p0/M, z0.s, z4.s\n"
"fmla z24.s, p0/M, z1.s, z4.s\n"
"fmla z25.s, p0/M, z2.s, z4.s\n"
"fmla z26.s, p0/M, z0.s, z5.s\n"
"fmla z27.s, p0/M, z1.s, z5.s\n"
"fmla z28.s, p0/M, z2.s, z5.s\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z29.s, p0/M, z0.s, z3.s\n"
+ "fmla z30.s, p0/M, z1.s, z3.s\n"
+ "fmla z31.s, p0/M, z2.s, z3.s\n"
"cbz x20, 5f\n"
- "ld1w { z0.s }, p0/Z, [x22]\n"
- "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z4.s }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "fmla z8.s, p0/M, z0.s, z3.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
- "fmla z9.s, p0/M, z1.s, z3.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
- "fmla z10.s, p0/M, z2.s, z3.s\n"
- "fmla z11.s, p0/M, z0.s, z4.s\n"
- "fmla z12.s, p0/M, z1.s, z4.s\n"
- "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z10.s, p0/M, z4.s, z3.s\n"
+ "fmla z11.s, p0/M, z6.s, z2.s\n"
+ "fmla z12.s, p0/M, z5.s, z2.s\n"
+ "fmla z13.s, p0/M, z4.s, z2.s\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z14.s, p0/M, z0.s, z5.s\n"
- "fmla z15.s, p0/M, z1.s, z5.s\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
- "fmla z16.s, p0/M, z2.s, z5.s\n"
- "fmla z17.s, p0/M, z0.s, z6.s\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
- "fmla z18.s, p0/M, z1.s, z6.s\n"
- "fmla z19.s, p0/M, z2.s, z6.s\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "fmla z20.s, p0/M, z0.s, z3.s\n"
- "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z14.s, p0/M, z6.s, z1.s\n"
+ "fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z16.s, p0/M, z4.s, z1.s\n"
+ "fmla z17.s, p0/M, z6.s, z0.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z18.s, p0/M, z5.s, z0.s\n"
+ "fmla z19.s, p0/M, z4.s, z0.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z6.s, z3.s\n"
+ "fmla z21.s, p0/M, z5.s, z3.s\n"
"addvl x22, x22, #3\n"
- "fmla z22.s, p0/M, z2.s, z3.s\n"
- "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z22.s, p0/M, z4.s, z3.s\n"
+ "fmla z23.s, p0/M, z6.s, z2.s\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla z24.s, p0/M, z1.s, z4.s\n"
- "fmla z25.s, p0/M, z2.s, z4.s\n"
- "fmla z26.s, p0/M, z0.s, z5.s\n"
- "fmla z27.s, p0/M, z1.s, z5.s\n"
- "fmla z28.s, p0/M, z2.s, z5.s\n"
- "fmla z29.s, p0/M, z0.s, z6.s\n"
- "fmla z30.s, p0/M, z1.s, z6.s\n"
- "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "fmla z24.s, p0/M, z5.s, z2.s\n"
+ "fmla z25.s, p0/M, z4.s, z2.s\n"
+ "fmla z26.s, p0/M, z6.s, z1.s\n"
+ "fmla z27.s, p0/M, z5.s, z1.s\n"
+ "fmla z28.s, p0/M, z4.s, z1.s\n"
+ "fmla z29.s, p0/M, z6.s, z0.s\n"
+ "fmla z30.s, p0/M, z5.s, z0.s\n"
+ "fmla z31.s, p0/M, z4.s, z0.s\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -261,7 +265,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index 9d1c0c3728..c7f32ff7a9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -28,8 +28,12 @@
namespace arm_gemm {
void sve_interleaved_fp32_mla_8x3VL(
- const float *Apanel, const float *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *Apanel,
+ const float *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -84,10 +88,10 @@ void sve_interleaved_fp32_mla_8x3VL(
"3:" // main loop head
"fmla z8.s, z4.s, z0.s[0]\n"
"fmla z11.s, z4.s, z0.s[1]\n"
- "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
"fmla z14.s, z4.s, z0.s[2]\n"
"fmla z17.s, z4.s, z0.s[3]\n"
- "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
"fmla z20.s, z4.s, z1.s[0]\n"
"fmla z23.s, z4.s, z1.s[1]\n"
"sub x20, x20, #0x2\n"
@@ -114,35 +118,35 @@ void sve_interleaved_fp32_mla_8x3VL(
"fmla z25.s, z6.s, z1.s[1]\n"
"fmla z28.s, z6.s, z1.s[2]\n"
"fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
"addvl x22, x22, #6\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "fmla z11.s, z4.s, z2.s[1]\n"
+ "fmla z8.s, z4.s, z3.s[0]\n"
+ "fmla z11.s, z4.s, z3.s[1]\n"
"ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
- "fmla z14.s, z4.s, z2.s[2]\n"
- "fmla z17.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z23.s, z4.s, z3.s[1]\n"
- "fmla z26.s, z4.s, z3.s[2]\n"
- "fmla z29.s, z4.s, z3.s[3]\n"
+ "fmla z14.s, z4.s, z3.s[2]\n"
+ "fmla z17.s, z4.s, z3.s[3]\n"
+ "fmla z20.s, z4.s, z7.s[0]\n"
+ "fmla z23.s, z4.s, z7.s[1]\n"
+ "fmla z26.s, z4.s, z7.s[2]\n"
+ "fmla z29.s, z4.s, z7.s[3]\n"
"ld1w { z4.s }, p0/Z, [x22]\n"
- "fmla z9.s, z5.s, z2.s[0]\n"
- "fmla z12.s, z5.s, z2.s[1]\n"
- "fmla z15.s, z5.s, z2.s[2]\n"
- "fmla z18.s, z5.s, z2.s[3]\n"
- "fmla z21.s, z5.s, z3.s[0]\n"
- "fmla z24.s, z5.s, z3.s[1]\n"
- "fmla z27.s, z5.s, z3.s[2]\n"
- "fmla z30.s, z5.s, z3.s[3]\n"
+ "fmla z9.s, z5.s, z3.s[0]\n"
+ "fmla z12.s, z5.s, z3.s[1]\n"
+ "fmla z15.s, z5.s, z3.s[2]\n"
+ "fmla z18.s, z5.s, z3.s[3]\n"
+ "fmla z21.s, z5.s, z7.s[0]\n"
+ "fmla z24.s, z5.s, z7.s[1]\n"
+ "fmla z27.s, z5.s, z7.s[2]\n"
+ "fmla z30.s, z5.s, z7.s[3]\n"
"ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
- "fmla z10.s, z6.s, z2.s[0]\n"
- "fmla z13.s, z6.s, z2.s[1]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z25.s, z6.s, z3.s[1]\n"
- "fmla z28.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
+ "fmla z10.s, z2.s, z3.s[0]\n"
+ "fmla z13.s, z2.s, z3.s[1]\n"
+ "fmla z16.s, z2.s, z3.s[2]\n"
+ "fmla z19.s, z2.s, z3.s[3]\n"
+ "fmla z22.s, z2.s, z7.s[0]\n"
+ "fmla z25.s, z2.s, z7.s[1]\n"
+ "fmla z28.s, z2.s, z7.s[2]\n"
+ "fmla z31.s, z2.s, z7.s[3]\n"
"ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -173,37 +177,37 @@ void sve_interleaved_fp32_mla_8x3VL(
"fmla z28.s, z6.s, z1.s[2]\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"cbz x20, 5f\n"
- "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
- "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1w { z7.s }, p0/Z, [x22]\n"
- "ld1w { z4.s }, p0/Z, [x22, #1, MUL VL]\n"
- "fmla z8.s, z7.s, z0.s[0]\n"
- "ld1w { z5.s }, p0/Z, [x22, #2, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z14.s, z7.s, z0.s[2]\n"
- "fmla z17.s, z7.s, z0.s[3]\n"
- "fmla z20.s, z7.s, z1.s[0]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "fmla z11.s, z2.s, z4.s[1]\n"
+ "fmla z14.s, z2.s, z4.s[2]\n"
+ "fmla z17.s, z2.s, z4.s[3]\n"
+ "fmla z20.s, z2.s, z3.s[0]\n"
"addvl x22, x22, #3\n"
- "fmla z23.s, z7.s, z1.s[1]\n"
- "fmla z26.s, z7.s, z1.s[2]\n"
- "fmla z29.s, z7.s, z1.s[3]\n"
- "fmla z9.s, z4.s, z0.s[0]\n"
- "fmla z12.s, z4.s, z0.s[1]\n"
- "fmla z15.s, z4.s, z0.s[2]\n"
- "fmla z18.s, z4.s, z0.s[3]\n"
- "fmla z21.s, z4.s, z1.s[0]\n"
- "fmla z24.s, z4.s, z1.s[1]\n"
- "fmla z27.s, z4.s, z1.s[2]\n"
- "fmla z30.s, z4.s, z1.s[3]\n"
- "fmla z10.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z16.s, z5.s, z0.s[2]\n"
- "fmla z19.s, z5.s, z0.s[3]\n"
- "fmla z22.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z28.s, z5.s, z1.s[2]\n"
- "fmla z31.s, z5.s, z1.s[3]\n"
+ "fmla z23.s, z2.s, z3.s[1]\n"
+ "fmla z26.s, z2.s, z3.s[2]\n"
+ "fmla z29.s, z2.s, z3.s[3]\n"
+ "fmla z9.s, z1.s, z4.s[0]\n"
+ "fmla z12.s, z1.s, z4.s[1]\n"
+ "fmla z15.s, z1.s, z4.s[2]\n"
+ "fmla z18.s, z1.s, z4.s[3]\n"
+ "fmla z21.s, z1.s, z3.s[0]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z27.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z3.s[3]\n"
+ "fmla z10.s, z0.s, z4.s[0]\n"
+ "fmla z13.s, z0.s, z4.s[1]\n"
+ "fmla z16.s, z0.s, z4.s[2]\n"
+ "fmla z19.s, z0.s, z4.s[3]\n"
+ "fmla z22.s, z0.s, z3.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z28.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z3.s[3]\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 0d707b0391..cf3069f828 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_interleaved_s8s32_dot_8x3VL( ARGLIST );
+void sve_interleaved_s8s32_dot_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_s8s32_dot_8x3VL
{
@@ -55,11 +56,6 @@ public:
return get_vector_length<int32_t>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<int32_t>();
- }
-
static constexpr unsigned int k_unroll()
{
return 4;
@@ -80,6 +76,8 @@ public:
return { 63.30, 4.97, 11.35 };
case CPUModel::A510:
return { 27.42, 3.47, 2.88 };
+ case CPUModel::A64FX:
+ return { 109.18, 3.88, 7.85 };
}
}
@@ -92,6 +90,8 @@ public:
return { 52.24, 7.49, 0.80 };
case CPUModel::A510:
return { 27.47, 1.70, 0.28 };
+ case CPUModel::A64FX:
+ return { 109.92, 2.36, 0.41 };
}
}
@@ -100,13 +100,19 @@ public:
// Default to the generic kernel
kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
- cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
+ cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_s8s32_dot_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
index a7ca48d87a..c668a7b746 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_s8s32_dot_8x3VL_a64fx(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -89,7 +93,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z9.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
"sdot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"sdot z11.s, z0.b, z4.b\n"
"sdot z12.s, z1.b, z4.b\n"
"sdot z13.s, z2.b, z4.b\n"
@@ -98,63 +102,63 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z15.s, z1.b, z5.b\n"
"cmp x20, #0x2\n"
"sdot z16.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"sdot z17.s, z0.b, z6.b\n"
"sdot z18.s, z1.b, z6.b\n"
"sdot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "sdot z20.s, z0.b, z3.b\n"
- "sdot z21.s, z1.b, z3.b\n"
- "sdot z22.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z7.b\n"
+ "sdot z21.s, z1.b, z7.b\n"
+ "sdot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"sdot z23.s, z0.b, z4.b\n"
"sdot z24.s, z1.b, z4.b\n"
"sdot z25.s, z2.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "sdot z26.s, z0.b, z5.b\n"
- "sdot z27.s, z1.b, z5.b\n"
- "sdot z28.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
- "sdot z29.s, z0.b, z6.b\n"
- "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n"
- "sdot z30.s, z1.b, z6.b\n"
- "sdot z31.s, z2.b, z6.b\n"
- "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
- "sdot z8.s, z0.b, z3.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
- "sdot z9.s, z1.b, z3.b\n"
- "sdot z10.s, z2.b, z3.b\n"
- "sdot z11.s, z0.b, z4.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
- "sdot z12.s, z1.b, z4.b\n"
- "sdot z13.s, z2.b, z4.b\n"
+ "sdot z26.s, z0.b, z3.b\n"
+ "sdot z27.s, z1.b, z3.b\n"
+ "sdot z28.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "sdot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "sdot z30.s, z1.b, z5.b\n"
+ "sdot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z7.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "sdot z9.s, z2.b, z7.b\n"
+ "sdot z10.s, z5.b, z7.b\n"
+ "sdot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z12.s, z2.b, z4.b\n"
+ "sdot z13.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "sdot z14.s, z0.b, z5.b\n"
- "sdot z15.s, z1.b, z5.b\n"
+ "sdot z14.s, z6.b, z3.b\n"
+ "sdot z15.s, z2.b, z3.b\n"
"addvl x22, x22, #6\n"
- "sdot z16.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
- "sdot z17.s, z0.b, z6.b\n"
- "sdot z18.s, z1.b, z6.b\n"
- "sdot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "sdot z16.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "sdot z17.s, z6.b, z1.b\n"
+ "sdot z18.s, z2.b, z1.b\n"
+ "sdot z19.s, z5.b, z1.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "sdot z20.s, z0.b, z3.b\n"
- "sdot z21.s, z1.b, z3.b\n"
- "sdot z22.s, z2.b, z3.b\n"
- "sdot z23.s, z0.b, z4.b\n"
+ "sdot z20.s, z6.b, z7.b\n"
+ "sdot z21.s, z2.b, z7.b\n"
+ "sdot z22.s, z5.b, z7.b\n"
+ "sdot z23.s, z6.b, z4.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "sdot z24.s, z1.b, z4.b\n"
- "sdot z25.s, z2.b, z4.b\n"
+ "sdot z24.s, z2.b, z4.b\n"
+ "sdot z25.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "sdot z26.s, z0.b, z5.b\n"
- "sdot z27.s, z1.b, z5.b\n"
- "sdot z28.s, z2.b, z5.b\n"
- "sdot z29.s, z0.b, z6.b\n"
+ "sdot z26.s, z6.b, z0.b\n"
+ "sdot z27.s, z2.b, z0.b\n"
+ "sdot z28.s, z5.b, z0.b\n"
+ "sdot z29.s, z6.b, z1.b\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
- "sdot z30.s, z1.b, z6.b\n"
- "sdot z31.s, z2.b, z6.b\n"
+ "sdot z30.s, z2.b, z1.b\n"
+ "sdot z31.s, z5.b, z1.b\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -165,7 +169,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z9.s, z1.b, z3.b\n"
"addvl x22, x22, #3\n"
"sdot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"sdot z11.s, z0.b, z4.b\n"
"sdot z12.s, z1.b, z4.b\n"
"sdot z13.s, z2.b, z4.b\n"
@@ -177,58 +181,58 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"sdot z17.s, z0.b, z6.b\n"
"sdot z18.s, z1.b, z6.b\n"
"sdot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "sdot z20.s, z0.b, z3.b\n"
- "sdot z21.s, z1.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z7.b\n"
+ "sdot z21.s, z1.b, z7.b\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "sdot z22.s, z2.b, z3.b\n"
+ "sdot z22.s, z2.b, z7.b\n"
"sdot z23.s, z0.b, z4.b\n"
"sdot z24.s, z1.b, z4.b\n"
"sdot z25.s, z2.b, z4.b\n"
"sdot z26.s, z0.b, z5.b\n"
"sdot z27.s, z1.b, z5.b\n"
"sdot z28.s, z2.b, z5.b\n"
- "sdot z29.s, z0.b, z6.b\n"
- "sdot z30.s, z1.b, z6.b\n"
- "sdot z31.s, z2.b, z6.b\n"
+ "sdot z29.s, z0.b, z3.b\n"
+ "sdot z30.s, z1.b, z3.b\n"
+ "sdot z31.s, z2.b, z3.b\n"
"cbz x20, 5f\n"
- "ld1b { z0.b }, p0/Z, [x22]\n"
- "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "sdot z8.s, z0.b, z3.b\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
- "sdot z9.s, z1.b, z3.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
- "sdot z10.s, z2.b, z3.b\n"
- "sdot z11.s, z0.b, z4.b\n"
- "sdot z12.s, z1.b, z4.b\n"
- "sdot z13.s, z2.b, z4.b\n"
+ "sdot z8.s, z6.b, z3.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "sdot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "sdot z10.s, z4.b, z3.b\n"
+ "sdot z11.s, z6.b, z2.b\n"
+ "sdot z12.s, z5.b, z2.b\n"
+ "sdot z13.s, z4.b, z2.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
- "sdot z14.s, z0.b, z5.b\n"
- "sdot z15.s, z1.b, z5.b\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
- "sdot z16.s, z2.b, z5.b\n"
- "sdot z17.s, z0.b, z6.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
- "sdot z18.s, z1.b, z6.b\n"
- "sdot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "sdot z20.s, z0.b, z3.b\n"
- "sdot z21.s, z1.b, z3.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z16.s, z4.b, z1.b\n"
+ "sdot z17.s, z6.b, z0.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z18.s, z5.b, z0.b\n"
+ "sdot z19.s, z4.b, z0.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "sdot z21.s, z5.b, z3.b\n"
"addvl x22, x22, #3\n"
- "sdot z22.s, z2.b, z3.b\n"
- "sdot z23.s, z0.b, z4.b\n"
+ "sdot z22.s, z4.b, z3.b\n"
+ "sdot z23.s, z6.b, z2.b\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "sdot z24.s, z1.b, z4.b\n"
- "sdot z25.s, z2.b, z4.b\n"
- "sdot z26.s, z0.b, z5.b\n"
- "sdot z27.s, z1.b, z5.b\n"
- "sdot z28.s, z2.b, z5.b\n"
- "sdot z29.s, z0.b, z6.b\n"
- "sdot z30.s, z1.b, z6.b\n"
- "sdot z31.s, z2.b, z6.b\n"
+ "sdot z24.s, z5.b, z2.b\n"
+ "sdot z25.s, z4.b, z2.b\n"
+ "sdot z26.s, z6.b, z1.b\n"
+ "sdot z27.s, z5.b, z1.b\n"
+ "sdot z28.s, z4.b, z1.b\n"
+ "sdot z29.s, z6.b, z0.b\n"
+ "sdot z30.s, z5.b, z0.b\n"
+ "sdot z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -262,7 +266,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index e5f59d220b..f6e1a75c15 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_s8s32_dot_8x3VL(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,10 +89,10 @@ void sve_interleaved_s8s32_dot_8x3VL(
"3:" // main loop head
"sdot z8.s, z4.b, z0.b[0]\n"
"sdot z11.s, z4.b, z0.b[1]\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
"sdot z14.s, z4.b, z0.b[2]\n"
"sdot z17.s, z4.b, z0.b[3]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
"sdot z20.s, z4.b, z1.b[0]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@ void sve_interleaved_s8s32_dot_8x3VL(
"sdot z25.s, z6.b, z1.b[1]\n"
"sdot z28.s, z6.b, z1.b[2]\n"
"sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
"addvl x22, x22, #6\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "sdot z11.s, z4.b, z2.b[1]\n"
+ "sdot z8.s, z4.b, z3.b[0]\n"
+ "sdot z11.s, z4.b, z3.b[1]\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
- "sdot z14.s, z4.b, z2.b[2]\n"
- "sdot z17.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z23.s, z4.b, z3.b[1]\n"
- "sdot z26.s, z4.b, z3.b[2]\n"
- "sdot z29.s, z4.b, z3.b[3]\n"
+ "sdot z14.s, z4.b, z3.b[2]\n"
+ "sdot z17.s, z4.b, z3.b[3]\n"
+ "sdot z20.s, z4.b, z7.b[0]\n"
+ "sdot z23.s, z4.b, z7.b[1]\n"
+ "sdot z26.s, z4.b, z7.b[2]\n"
+ "sdot z29.s, z4.b, z7.b[3]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "sdot z9.s, z5.b, z2.b[0]\n"
- "sdot z12.s, z5.b, z2.b[1]\n"
- "sdot z15.s, z5.b, z2.b[2]\n"
- "sdot z18.s, z5.b, z2.b[3]\n"
- "sdot z21.s, z5.b, z3.b[0]\n"
- "sdot z24.s, z5.b, z3.b[1]\n"
- "sdot z27.s, z5.b, z3.b[2]\n"
- "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z9.s, z5.b, z3.b[0]\n"
+ "sdot z12.s, z5.b, z3.b[1]\n"
+ "sdot z15.s, z5.b, z3.b[2]\n"
+ "sdot z18.s, z5.b, z3.b[3]\n"
+ "sdot z21.s, z5.b, z7.b[0]\n"
+ "sdot z24.s, z5.b, z7.b[1]\n"
+ "sdot z27.s, z5.b, z7.b[2]\n"
+ "sdot z30.s, z5.b, z7.b[3]\n"
"ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "sdot z10.s, z6.b, z2.b[0]\n"
- "sdot z13.s, z6.b, z2.b[1]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z25.s, z6.b, z3.b[1]\n"
- "sdot z28.s, z6.b, z3.b[2]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
+ "sdot z10.s, z2.b, z3.b[0]\n"
+ "sdot z13.s, z2.b, z3.b[1]\n"
+ "sdot z16.s, z2.b, z3.b[2]\n"
+ "sdot z19.s, z2.b, z3.b[3]\n"
+ "sdot z22.s, z2.b, z7.b[0]\n"
+ "sdot z25.s, z2.b, z7.b[1]\n"
+ "sdot z28.s, z2.b, z7.b[2]\n"
+ "sdot z31.s, z2.b, z7.b[3]\n"
"ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -174,37 +178,37 @@ void sve_interleaved_s8s32_dot_8x3VL(
"sdot z28.s, z6.b, z1.b[2]\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"cbz x20, 5f\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1b { z7.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n"
- "sdot z8.s, z7.b, z0.b[0]\n"
- "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z14.s, z7.b, z0.b[2]\n"
- "sdot z17.s, z7.b, z0.b[3]\n"
- "sdot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "sdot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "sdot z11.s, z2.b, z4.b[1]\n"
+ "sdot z14.s, z2.b, z4.b[2]\n"
+ "sdot z17.s, z2.b, z4.b[3]\n"
+ "sdot z20.s, z2.b, z3.b[0]\n"
"addvl x22, x22, #3\n"
- "sdot z23.s, z7.b, z1.b[1]\n"
- "sdot z26.s, z7.b, z1.b[2]\n"
- "sdot z29.s, z7.b, z1.b[3]\n"
- "sdot z9.s, z4.b, z0.b[0]\n"
- "sdot z12.s, z4.b, z0.b[1]\n"
- "sdot z15.s, z4.b, z0.b[2]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z21.s, z4.b, z1.b[0]\n"
- "sdot z24.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z1.b[2]\n"
- "sdot z30.s, z4.b, z1.b[3]\n"
- "sdot z10.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z22.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z28.s, z5.b, z1.b[2]\n"
- "sdot z31.s, z5.b, z1.b[3]\n"
+ "sdot z23.s, z2.b, z3.b[1]\n"
+ "sdot z26.s, z2.b, z3.b[2]\n"
+ "sdot z29.s, z2.b, z3.b[3]\n"
+ "sdot z9.s, z1.b, z4.b[0]\n"
+ "sdot z12.s, z1.b, z4.b[1]\n"
+ "sdot z15.s, z1.b, z4.b[2]\n"
+ "sdot z18.s, z1.b, z4.b[3]\n"
+ "sdot z21.s, z1.b, z3.b[0]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z27.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z3.b[3]\n"
+ "sdot z10.s, z0.b, z4.b[0]\n"
+ "sdot z13.s, z0.b, z4.b[1]\n"
+ "sdot z16.s, z0.b, z4.b[2]\n"
+ "sdot z19.s, z0.b, z4.b[3]\n"
+ "sdot z22.s, z0.b, z3.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z28.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z3.b[3]\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 4e65296f8b..82734abfbe 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -55,11 +55,6 @@ public:
return get_vector_length<int32_t>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<int32_t>();
- }
-
static constexpr unsigned int k_unroll()
{
return 8;
@@ -89,7 +84,7 @@ public:
default:
return { 61.97, 3.64, 0.50 };
case CPUModel::V1:
- return { 95.28, 7.99, 0.79 };
+ return { 95.28, 7.99, 0.79 };
case CPUModel::A510:
return { 43.36, 1.86, 0.28 };
}
@@ -108,5 +103,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index 104d5f918e..bfed5000fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_s8s32_mmla_8x3VL(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,82 +89,82 @@ void sve_interleaved_s8s32_mmla_8x3VL(
"mov z31.s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
- ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
- ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450498da // smmla z26.s, z6.b, z4.b\n"
+ ".inst 0x450598dd // smmla z29.s, z6.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
+ ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
- ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
+ ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
"cmp x20, #0x2\n"
- ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n"
- ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
+ ".inst 0x450798db // smmla z27.s, z6.b, z7.b\n"
+ ".inst 0x450398de // smmla z30.s, z6.b, z3.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
+ ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
+ ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
"ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
- "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
+ ".inst 0x450598dc // smmla z28.s, z6.b, z5.b\n"
+ ".inst 0x450498df // smmla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x45039808 // smmla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
"addvl x22, x22, #16\n"
".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x4503982e // smmla z14.s, z1.b, z3.b\n"
".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n"
+ ".inst 0x450398b4 // smmla z20.s, z5.b, z3.b\n"
+ ".inst 0x450798b7 // smmla z23.s, z5.b, z7.b\n"
+ ".inst 0x450398da // smmla z26.s, z6.b, z3.b\n"
+ ".inst 0x450798dd // smmla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n"
- ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n"
- ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n"
- ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n"
- ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n"
- ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n"
+ ".inst 0x45029809 // smmla z9.s, z0.b, z2.b\n"
+ ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
+ ".inst 0x4502982f // smmla z15.s, z1.b, z2.b\n"
+ ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
+ ".inst 0x450298b5 // smmla z21.s, z5.b, z2.b\n"
+ ".inst 0x450498b8 // smmla z24.s, z5.b, z4.b\n"
+ ".inst 0x450298db // smmla z27.s, z6.b, z2.b\n"
+ ".inst 0x450498de // smmla z30.s, z6.b, z4.b\n"
"ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
+ ".inst 0x4503980a // smmla z10.s, z0.b, z3.b\n"
".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45039830 // smmla z16.s, z1.b, z3.b\n"
".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ ".inst 0x450398b6 // smmla z22.s, z5.b, z3.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
"ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
+ ".inst 0x450398dc // smmla z28.s, z6.b, z3.b\n"
+ ".inst 0x450798df // smmla z31.s, z6.b, z7.b\n"
"ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
@@ -168,114 +172,114 @@ void sve_interleaved_s8s32_mmla_8x3VL(
"ld1b { z6.b }, p0/Z, [x22]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450498fa // smmla z26.s, z7.b, z4.b\n"
+ ".inst 0x450598fd // smmla z29.s, z7.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
- ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
- ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
- ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
"addvl x22, x22, #4\n"
- ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n"
- ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
+ ".inst 0x450698fb // smmla z27.s, z7.b, z6.b\n"
+ ".inst 0x450398fe // smmla z30.s, z7.b, z3.b\n"
+ ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
+ ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
+ ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
+ ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
+ ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ ".inst 0x450598fc // smmla z28.s, z7.b, z5.b\n"
+ ".inst 0x450498ff // smmla z31.s, z7.b, z4.b\n"
"cbz x20, 5f\n"
- "ld1b { z6.b }, p0/Z, [x22]\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n"
- ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n"
+ "ld1b { z1.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450098eb // smmla z11.s, z7.b, z0.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x450198ce // smmla z14.s, z6.b, z1.b\n"
+ ".inst 0x450098d1 // smmla z17.s, z6.b, z0.b\n"
+ ".inst 0x450198b4 // smmla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x450398e9 // smmla z9.s, z7.b, z3.b\n"
+ ".inst 0x450298ec // smmla z12.s, z7.b, z2.b\n"
"addvl x22, x22, #6\n"
- ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n"
- ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n"
+ ".inst 0x450398cf // smmla z15.s, z6.b, z3.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n"
- ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
- ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n"
- ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
+ ".inst 0x450398b5 // smmla z21.s, z5.b, z3.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4503989b // smmla z27.s, z4.b, z3.b\n"
+ ".inst 0x4502989e // smmla z30.s, z4.b, z2.b\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450198d0 // smmla z16.s, z6.b, z1.b\n"
+ ".inst 0x450098d3 // smmla z19.s, z6.b, z0.b\n"
+ ".inst 0x450198b6 // smmla z22.s, z5.b, z1.b\n"
+ ".inst 0x450098b9 // smmla z25.s, z5.b, z0.b\n"
+ ".inst 0x4501989c // smmla z28.s, z4.b, z1.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
- "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "st1w { z4.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z11.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
- "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z12.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
"st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
"subs x23, x23, #0x1\n"
"st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "uzp1 z18.d, z16.d, z19.d\n"
- "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "uzp1 z19.d, z20.d, z23.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
"addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
"uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
"st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
"st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
"st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "uzp1 z30.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
"st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@ void sve_interleaved_s8s32_mmla_8x3VL(
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 0afcdd2ce4..c0b215ccb4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_interleaved_u8u32_dot_8x3VL( ARGLIST );
+void sve_interleaved_u8u32_dot_8x3VL_a64fx( ARGLIST );
class cls_sve_interleaved_u8u32_dot_8x3VL
{
@@ -55,11 +56,6 @@ public:
return get_vector_length<uint32_t>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<uint32_t>();
- }
-
static constexpr unsigned int k_unroll()
{
return 4;
@@ -80,6 +76,8 @@ public:
return { 27.44, 3.41, 2.90 };
case CPUModel::V1:
return { 63.30, 4.97, 11.52 };
+ case CPUModel::A64FX:
+ return { 109.76, 3.88, 6.76 };
}
}
@@ -92,6 +90,8 @@ public:
return { 27.45, 1.65, 0.28 };
case CPUModel::V1:
return { 52.24, 7.49, 0.80 };
+ case CPUModel::A64FX:
+ return { 110.18, 2.34, 0.40 };
}
}
@@ -100,13 +100,19 @@ public:
// Default to the generic kernel
kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
- cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
+ cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_u8u32_dot_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
index 2bfec8f350..79e794a834 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_u8u32_dot_8x3VL_a64fx(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -89,7 +93,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z9.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
"udot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"udot z11.s, z0.b, z4.b\n"
"udot z12.s, z1.b, z4.b\n"
"udot z13.s, z2.b, z4.b\n"
@@ -98,63 +102,63 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z15.s, z1.b, z5.b\n"
"cmp x20, #0x2\n"
"udot z16.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
"udot z17.s, z0.b, z6.b\n"
"udot z18.s, z1.b, z6.b\n"
"udot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "udot z20.s, z0.b, z3.b\n"
- "udot z21.s, z1.b, z3.b\n"
- "udot z22.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z7.b\n"
+ "udot z21.s, z1.b, z7.b\n"
+ "udot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
"udot z23.s, z0.b, z4.b\n"
"udot z24.s, z1.b, z4.b\n"
"udot z25.s, z2.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
- "udot z26.s, z0.b, z5.b\n"
- "udot z27.s, z1.b, z5.b\n"
- "udot z28.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
- "udot z29.s, z0.b, z6.b\n"
- "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n"
- "udot z30.s, z1.b, z6.b\n"
- "udot z31.s, z2.b, z6.b\n"
- "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
- "udot z8.s, z0.b, z3.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
- "udot z9.s, z1.b, z3.b\n"
- "udot z10.s, z2.b, z3.b\n"
- "udot z11.s, z0.b, z4.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
- "udot z12.s, z1.b, z4.b\n"
- "udot z13.s, z2.b, z4.b\n"
+ "udot z26.s, z0.b, z3.b\n"
+ "udot z27.s, z1.b, z3.b\n"
+ "udot z28.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "udot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "udot z30.s, z1.b, z5.b\n"
+ "udot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z7.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "udot z9.s, z2.b, z7.b\n"
+ "udot z10.s, z5.b, z7.b\n"
+ "udot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z12.s, z2.b, z4.b\n"
+ "udot z13.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
- "udot z14.s, z0.b, z5.b\n"
- "udot z15.s, z1.b, z5.b\n"
+ "udot z14.s, z6.b, z3.b\n"
+ "udot z15.s, z2.b, z3.b\n"
"addvl x22, x22, #6\n"
- "udot z16.s, z2.b, z5.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
- "udot z17.s, z0.b, z6.b\n"
- "udot z18.s, z1.b, z6.b\n"
- "udot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "udot z16.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "udot z17.s, z6.b, z1.b\n"
+ "udot z18.s, z2.b, z1.b\n"
+ "udot z19.s, z5.b, z1.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- "udot z20.s, z0.b, z3.b\n"
- "udot z21.s, z1.b, z3.b\n"
- "udot z22.s, z2.b, z3.b\n"
- "udot z23.s, z0.b, z4.b\n"
+ "udot z20.s, z6.b, z7.b\n"
+ "udot z21.s, z2.b, z7.b\n"
+ "udot z22.s, z5.b, z7.b\n"
+ "udot z23.s, z6.b, z4.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "udot z24.s, z1.b, z4.b\n"
- "udot z25.s, z2.b, z4.b\n"
+ "udot z24.s, z2.b, z4.b\n"
+ "udot z25.s, z5.b, z4.b\n"
"ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "udot z26.s, z0.b, z5.b\n"
- "udot z27.s, z1.b, z5.b\n"
- "udot z28.s, z2.b, z5.b\n"
- "udot z29.s, z0.b, z6.b\n"
+ "udot z26.s, z6.b, z0.b\n"
+ "udot z27.s, z2.b, z0.b\n"
+ "udot z28.s, z5.b, z0.b\n"
+ "udot z29.s, z6.b, z1.b\n"
"ld1b { z0.b }, p0/Z, [x22]\n"
- "udot z30.s, z1.b, z6.b\n"
- "udot z31.s, z2.b, z6.b\n"
+ "udot z30.s, z2.b, z1.b\n"
+ "udot z31.s, z5.b, z1.b\n"
"ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
"ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -165,7 +169,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z9.s, z1.b, z3.b\n"
"addvl x22, x22, #3\n"
"udot z10.s, z2.b, z3.b\n"
- "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
"udot z11.s, z0.b, z4.b\n"
"udot z12.s, z1.b, z4.b\n"
"udot z13.s, z2.b, z4.b\n"
@@ -177,58 +181,58 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"udot z17.s, z0.b, z6.b\n"
"udot z18.s, z1.b, z6.b\n"
"udot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "udot z20.s, z0.b, z3.b\n"
- "udot z21.s, z1.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z7.b\n"
+ "udot z21.s, z1.b, z7.b\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "udot z22.s, z2.b, z3.b\n"
+ "udot z22.s, z2.b, z7.b\n"
"udot z23.s, z0.b, z4.b\n"
"udot z24.s, z1.b, z4.b\n"
"udot z25.s, z2.b, z4.b\n"
"udot z26.s, z0.b, z5.b\n"
"udot z27.s, z1.b, z5.b\n"
"udot z28.s, z2.b, z5.b\n"
- "udot z29.s, z0.b, z6.b\n"
- "udot z30.s, z1.b, z6.b\n"
- "udot z31.s, z2.b, z6.b\n"
+ "udot z29.s, z0.b, z3.b\n"
+ "udot z30.s, z1.b, z3.b\n"
+ "udot z31.s, z2.b, z3.b\n"
"cbz x20, 5f\n"
- "ld1b { z0.b }, p0/Z, [x22]\n"
- "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
- "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
- "udot z8.s, z0.b, z3.b\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
- "udot z9.s, z1.b, z3.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
- "udot z10.s, z2.b, z3.b\n"
- "udot z11.s, z0.b, z4.b\n"
- "udot z12.s, z1.b, z4.b\n"
- "udot z13.s, z2.b, z4.b\n"
+ "udot z8.s, z6.b, z3.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "udot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "udot z10.s, z4.b, z3.b\n"
+ "udot z11.s, z6.b, z2.b\n"
+ "udot z12.s, z5.b, z2.b\n"
+ "udot z13.s, z4.b, z2.b\n"
"ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
- "udot z14.s, z0.b, z5.b\n"
- "udot z15.s, z1.b, z5.b\n"
- "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
- "udot z16.s, z2.b, z5.b\n"
- "udot z17.s, z0.b, z6.b\n"
- "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
- "udot z18.s, z1.b, z6.b\n"
- "udot z19.s, z2.b, z6.b\n"
- "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
- "udot z20.s, z0.b, z3.b\n"
- "udot z21.s, z1.b, z3.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z16.s, z4.b, z1.b\n"
+ "udot z17.s, z6.b, z0.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z18.s, z5.b, z0.b\n"
+ "udot z19.s, z4.b, z0.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "udot z21.s, z5.b, z3.b\n"
"addvl x22, x22, #3\n"
- "udot z22.s, z2.b, z3.b\n"
- "udot z23.s, z0.b, z4.b\n"
+ "udot z22.s, z4.b, z3.b\n"
+ "udot z23.s, z6.b, z2.b\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "udot z24.s, z1.b, z4.b\n"
- "udot z25.s, z2.b, z4.b\n"
- "udot z26.s, z0.b, z5.b\n"
- "udot z27.s, z1.b, z5.b\n"
- "udot z28.s, z2.b, z5.b\n"
- "udot z29.s, z0.b, z6.b\n"
- "udot z30.s, z1.b, z6.b\n"
- "udot z31.s, z2.b, z6.b\n"
+ "udot z24.s, z5.b, z2.b\n"
+ "udot z25.s, z4.b, z2.b\n"
+ "udot z26.s, z6.b, z1.b\n"
+ "udot z27.s, z5.b, z1.b\n"
+ "udot z28.s, z4.b, z1.b\n"
+ "udot z29.s, z6.b, z0.b\n"
+ "udot z30.s, z5.b, z0.b\n"
+ "udot z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -262,7 +266,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx(
"bne 1b\n"
: [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
: [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
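
The clobber-list change at the end of this hunk follows directly from the register renaming above it: the rewritten loop now cycles z7 through the load/compute rotation, so z7 must be declared clobbered, otherwise the compiler may assume the register survives the asm block. An illustrative extended-asm fragment (not the kernel itself) showing that contract:

    #include <cstdint>

    // Doubles four lanes in place. Listing "v0" in the clobbers tells the
    // compiler the asm writes that register; omitting a written register
    // from the clobber list is undefined behaviour. (AArch64-only sketch.)
    void double4(int32_t *data) {
        __asm__ __volatile__(
            "ld1 { v0.4s }, [%x[ptr]]\n"
            "add v0.4s, v0.4s, v0.4s\n"
            "st1 { v0.4s }, [%x[ptr]]\n"
            :
            : [ptr] "r" (data)
            : "memory", "v0");
    }
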
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 99fff4e83d..1c88336c2d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_u8u32_dot_8x3VL(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,10 +89,10 @@ void sve_interleaved_u8u32_dot_8x3VL(
"3:" // main loop head
"udot z8.s, z4.b, z0.b[0]\n"
"udot z11.s, z4.b, z0.b[1]\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
"udot z14.s, z4.b, z0.b[2]\n"
"udot z17.s, z4.b, z0.b[3]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
"udot z20.s, z4.b, z1.b[0]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@ void sve_interleaved_u8u32_dot_8x3VL(
"udot z25.s, z6.b, z1.b[1]\n"
"udot z28.s, z6.b, z1.b[2]\n"
"udot z31.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
"addvl x22, x22, #6\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "udot z11.s, z4.b, z2.b[1]\n"
+ "udot z8.s, z4.b, z3.b[0]\n"
+ "udot z11.s, z4.b, z3.b[1]\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
- "udot z14.s, z4.b, z2.b[2]\n"
- "udot z17.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z23.s, z4.b, z3.b[1]\n"
- "udot z26.s, z4.b, z3.b[2]\n"
- "udot z29.s, z4.b, z3.b[3]\n"
+ "udot z14.s, z4.b, z3.b[2]\n"
+ "udot z17.s, z4.b, z3.b[3]\n"
+ "udot z20.s, z4.b, z7.b[0]\n"
+ "udot z23.s, z4.b, z7.b[1]\n"
+ "udot z26.s, z4.b, z7.b[2]\n"
+ "udot z29.s, z4.b, z7.b[3]\n"
"ld1b { z4.b }, p0/Z, [x22]\n"
- "udot z9.s, z5.b, z2.b[0]\n"
- "udot z12.s, z5.b, z2.b[1]\n"
- "udot z15.s, z5.b, z2.b[2]\n"
- "udot z18.s, z5.b, z2.b[3]\n"
- "udot z21.s, z5.b, z3.b[0]\n"
- "udot z24.s, z5.b, z3.b[1]\n"
- "udot z27.s, z5.b, z3.b[2]\n"
- "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z9.s, z5.b, z3.b[0]\n"
+ "udot z12.s, z5.b, z3.b[1]\n"
+ "udot z15.s, z5.b, z3.b[2]\n"
+ "udot z18.s, z5.b, z3.b[3]\n"
+ "udot z21.s, z5.b, z7.b[0]\n"
+ "udot z24.s, z5.b, z7.b[1]\n"
+ "udot z27.s, z5.b, z7.b[2]\n"
+ "udot z30.s, z5.b, z7.b[3]\n"
"ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
- "udot z10.s, z6.b, z2.b[0]\n"
- "udot z13.s, z6.b, z2.b[1]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z25.s, z6.b, z3.b[1]\n"
- "udot z28.s, z6.b, z3.b[2]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
+ "udot z10.s, z2.b, z3.b[0]\n"
+ "udot z13.s, z2.b, z3.b[1]\n"
+ "udot z16.s, z2.b, z3.b[2]\n"
+ "udot z19.s, z2.b, z3.b[3]\n"
+ "udot z22.s, z2.b, z7.b[0]\n"
+ "udot z25.s, z2.b, z7.b[1]\n"
+ "udot z28.s, z2.b, z7.b[2]\n"
+ "udot z31.s, z2.b, z7.b[3]\n"
"ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -174,37 +178,37 @@ void sve_interleaved_u8u32_dot_8x3VL(
"udot z28.s, z6.b, z1.b[2]\n"
"udot z31.s, z6.b, z1.b[3]\n"
"cbz x20, 5f\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1b { z7.b }, p0/Z, [x22]\n"
- "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n"
- "udot z8.s, z7.b, z0.b[0]\n"
- "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z14.s, z7.b, z0.b[2]\n"
- "udot z17.s, z7.b, z0.b[3]\n"
- "udot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "udot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "udot z11.s, z2.b, z4.b[1]\n"
+ "udot z14.s, z2.b, z4.b[2]\n"
+ "udot z17.s, z2.b, z4.b[3]\n"
+ "udot z20.s, z2.b, z3.b[0]\n"
"addvl x22, x22, #3\n"
- "udot z23.s, z7.b, z1.b[1]\n"
- "udot z26.s, z7.b, z1.b[2]\n"
- "udot z29.s, z7.b, z1.b[3]\n"
- "udot z9.s, z4.b, z0.b[0]\n"
- "udot z12.s, z4.b, z0.b[1]\n"
- "udot z15.s, z4.b, z0.b[2]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z21.s, z4.b, z1.b[0]\n"
- "udot z24.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z1.b[2]\n"
- "udot z30.s, z4.b, z1.b[3]\n"
- "udot z10.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z22.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z28.s, z5.b, z1.b[2]\n"
- "udot z31.s, z5.b, z1.b[3]\n"
+ "udot z23.s, z2.b, z3.b[1]\n"
+ "udot z26.s, z2.b, z3.b[2]\n"
+ "udot z29.s, z2.b, z3.b[3]\n"
+ "udot z9.s, z1.b, z4.b[0]\n"
+ "udot z12.s, z1.b, z4.b[1]\n"
+ "udot z15.s, z1.b, z4.b[2]\n"
+ "udot z18.s, z1.b, z4.b[3]\n"
+ "udot z21.s, z1.b, z3.b[0]\n"
+ "udot z24.s, z1.b, z3.b[1]\n"
+ "udot z27.s, z1.b, z3.b[2]\n"
+ "udot z30.s, z1.b, z3.b[3]\n"
+ "udot z10.s, z0.b, z4.b[0]\n"
+ "udot z13.s, z0.b, z4.b[1]\n"
+ "udot z16.s, z0.b, z4.b[2]\n"
+ "udot z19.s, z0.b, z4.b[3]\n"
+ "udot z22.s, z0.b, z3.b[0]\n"
+ "udot z25.s, z0.b, z3.b[1]\n"
+ "udot z28.s, z0.b, z3.b[2]\n"
+ "udot z31.s, z0.b, z3.b[3]\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index 58d21d6c40..067d0bf258 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,20 +10,20 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -55,11 +55,6 @@ public:
return get_vector_length<uint32_t>() * 3;
}
- static unsigned int stripe_width()
- {
- return get_vector_length<uint32_t>();
- }
-
static constexpr unsigned int k_unroll()
{
return 8;
@@ -108,5 +103,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index 0b70d034dd..28449ea99b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_u8u32_mmla_8x3VL(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,82 +89,82 @@ void sve_interleaved_u8u32_mmla_8x3VL(
"mov z31.s, #0x0\n"
"blt 4f\n"
"3:" // main loop head
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
- ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
- ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c498da // ummla z26.s, z6.b, z4.b\n"
+ ".inst 0x45c598dd // ummla z29.s, z6.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
+ ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
"sub x20, x20, #0x2\n"
- ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
+ ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
"cmp x20, #0x2\n"
- ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n"
- ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
+ ".inst 0x45c798db // ummla z27.s, z6.b, z7.b\n"
+ ".inst 0x45c398de // ummla z30.s, z6.b, z3.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
+ ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
+ ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
"ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
- "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
+ ".inst 0x45c598dc // ummla z28.s, z6.b, z5.b\n"
+ ".inst 0x45c498df // ummla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x45c39808 // ummla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
"addvl x22, x22, #16\n"
".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c3982e // ummla z14.s, z1.b, z3.b\n"
".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n"
+ ".inst 0x45c398b4 // ummla z20.s, z5.b, z3.b\n"
+ ".inst 0x45c798b7 // ummla z23.s, z5.b, z7.b\n"
+ ".inst 0x45c398da // ummla z26.s, z6.b, z3.b\n"
+ ".inst 0x45c798dd // ummla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
"ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
- ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n"
- ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n"
- ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n"
- ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n"
- ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n"
- ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n"
+ ".inst 0x45c29809 // ummla z9.s, z0.b, z2.b\n"
+ ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
+ ".inst 0x45c2982f // ummla z15.s, z1.b, z2.b\n"
+ ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
+ ".inst 0x45c298b5 // ummla z21.s, z5.b, z2.b\n"
+ ".inst 0x45c498b8 // ummla z24.s, z5.b, z4.b\n"
+ ".inst 0x45c298db // ummla z27.s, z6.b, z2.b\n"
+ ".inst 0x45c498de // ummla z30.s, z6.b, z4.b\n"
"ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
+ ".inst 0x45c3980a // ummla z10.s, z0.b, z3.b\n"
".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
"ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c39830 // ummla z16.s, z1.b, z3.b\n"
".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
"ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ ".inst 0x45c398b6 // ummla z22.s, z5.b, z3.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
"ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
+ ".inst 0x45c398dc // ummla z28.s, z6.b, z3.b\n"
+ ".inst 0x45c798df // ummla z31.s, z6.b, z7.b\n"
"ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
"add %x[Apanel], %x[Apanel], #0x80\n"
"addvl x22, x22, #-4\n"
"bge 3b\n"
"4:" // main loop skip
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
@@ -168,114 +172,114 @@ void sve_interleaved_u8u32_mmla_8x3VL(
"ld1b { z6.b }, p0/Z, [x22]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c498fa // ummla z26.s, z7.b, z4.b\n"
+ ".inst 0x45c598fd // ummla z29.s, z7.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
- ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
- ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
- ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
"addvl x22, x22, #4\n"
- ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n"
- ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
+ ".inst 0x45c698fb // ummla z27.s, z7.b, z6.b\n"
+ ".inst 0x45c398fe // ummla z30.s, z7.b, z3.b\n"
+ ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
+ ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
+ ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
+ ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
+ ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ ".inst 0x45c598fc // ummla z28.s, z7.b, z5.b\n"
+ ".inst 0x45c498ff // ummla z31.s, z7.b, z4.b\n"
"cbz x20, 5f\n"
- "ld1b { z6.b }, p0/Z, [x22]\n"
- "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
- "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
- "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
- "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
- ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n"
- ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n"
+ "ld1b { z1.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c098eb // ummla z11.s, z7.b, z0.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x45c198ce // ummla z14.s, z6.b, z1.b\n"
+ ".inst 0x45c098d1 // ummla z17.s, z6.b, z0.b\n"
+ ".inst 0x45c198b4 // ummla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
+ ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x45c398e9 // ummla z9.s, z7.b, z3.b\n"
+ ".inst 0x45c298ec // ummla z12.s, z7.b, z2.b\n"
"addvl x22, x22, #6\n"
- ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n"
- ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n"
+ ".inst 0x45c398cf // ummla z15.s, z6.b, z3.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
"add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n"
- ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
- ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n"
- ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
+ ".inst 0x45c398b5 // ummla z21.s, z5.b, z3.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c3989b // ummla z27.s, z4.b, z3.b\n"
+ ".inst 0x45c2989e // ummla z30.s, z4.b, z2.b\n"
+ ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c198d0 // ummla z16.s, z6.b, z1.b\n"
+ ".inst 0x45c098d3 // ummla z19.s, z6.b, z0.b\n"
+ ".inst 0x45c198b6 // ummla z22.s, z5.b, z1.b\n"
+ ".inst 0x45c098b9 // ummla z25.s, z5.b, z0.b\n"
+ ".inst 0x45c1989c // ummla z28.s, z4.b, z1.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
"5:" // multiply loop done
- "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
"uzp2 z8.d, z8.d, z11.d\n"
- "st1w { z4.s }, p0, [%x[Cpanel]]\n"
- "uzp1 z11.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
"uzp2 z9.d, z9.d, z12.d\n"
- "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "uzp1 z12.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
"uzp2 z10.d, z10.d, z13.d\n"
- "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
"st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
"uzp2 z14.d, z14.d, z17.d\n"
"st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
- "uzp1 z17.d, z15.d, z18.d\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
"subs x23, x23, #0x1\n"
"st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"uzp2 z15.d, z15.d, z18.d\n"
- "uzp1 z18.d, z16.d, z19.d\n"
- "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"uzp2 z16.d, z16.d, z19.d\n"
- "uzp1 z19.d, z20.d, z23.d\n"
- "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
"addvl %x[Cpanel], %x[Cpanel], #16\n"
"uzp2 z20.d, z20.d, z23.d\n"
- "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
"uzp1 z23.d, z21.d, z24.d\n"
"uzp2 z21.d, z21.d, z24.d\n"
"st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
- "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
"uzp2 z22.d, z22.d, z25.d\n"
"st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
- "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
"uzp2 z26.d, z26.d, z29.d\n"
"st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
- "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
"uzp2 z27.d, z27.d, z30.d\n"
- "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
- "uzp1 z30.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
"uzp2 z28.d, z28.d, z31.d\n"
"st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
- "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
"st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
"st1w { z21.s }, p0, [%x[Cpanel]]\n"
"st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
- "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
- "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
- "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
"st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
"st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
"st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@ void sve_interleaved_u8u32_mmla_8x3VL(
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
index cf99bbdb46..87310d996d 100644
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018, 2022 Arm Limited.
+ * Copyright (c) 2017-2018, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,10 +56,14 @@ WeightFormat get_weight_format(const KernelWeightFormat kwf, size_t element_size
wf_i |= 0x10;
}
+#ifdef ARM_COMPUTE_ENABLE_SVE
// Get total bytes in vector output
if (kwf_i & 0x1) {
vector_bytes = vector_count * get_vector_length<uint8_t>();
} else {
+#else
+ if (1) {
+#endif
vector_bytes = vector_count * 16;
}
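
The misc.cpp hunk rewards a second look: pairing "} else {" under the SVE guard with "if (1) {" in the #else arm keeps the brace structure balanced in both configurations, so the fallback body and its closing brace are shared. A self-contained sketch of the same pattern (the helper and its return value are stand-ins for the library's get_vector_length<uint8_t>()):

    #include <cstddef>

    // Stand-in for the runtime SVE vector-length query; the value is illustrative.
    inline size_t sve_vector_length_bytes() { return 32; }

    size_t vector_bytes(bool per_vl, size_t vector_count) {
    #ifdef ARM_COMPUTE_ENABLE_SVE
        if (per_vl) {
            return vector_count * sve_vector_length_bytes();  // runtime SVE width
        } else {
    #else
        if (1) {
            (void)per_vl;  // unused in fixed-width builds
    #endif
            return vector_count * 16;  // fixed 128-bit NEON width
        }
    }
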
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index 3f3443025c..31dd65b397 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,7 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
unsigned int multi, unsigned int first_col);
template<typename T>
-void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
size_t M, int32_t *output_ptr, const Requantize32 *qp);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
index 7345793f93..94cd7ddfeb 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -34,7 +34,7 @@ namespace arm_gemm {
template<>
void row_sums_indirect(
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
size_t M, int32_t *out_ptr, const Requantize32 *qp
)
{
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
index ff95507d79..2ab0397fda 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
@@ -34,7 +34,7 @@ namespace arm_gemm {
template<>
void row_sums_indirect(
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
size_t M, int32_t *out_ptr, const Requantize32 *qp
)
{
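
The quantized.hpp and rowsum_indirect_*.cpp hunks have to move in lockstep: an explicit specialization of a function template only matches the primary if the signatures agree, so widening num_strings to size_t in the declaration forces the same edit in every template<> definition. A minimal illustration:

    #include <cstddef>

    template<typename T>
    void row_sums(size_t num_strings, const T *data);

    // Must repeat size_t exactly; writing "unsigned int" here would not
    // correspond to any specialization of the primary and fails to compile.
    template<>
    void row_sums<int>(size_t num_strings, const int *data) {
        (void)num_strings; (void)data;
    }
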
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
index ae452e1184..afe24e7ce0 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,9 +67,8 @@ public:
}
template<typename TOut>
- void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool accumulate) {
+ void Merge(TOut *, const TResult *, int, int, int, int, int, const TOut *, const Activation, bool) {
// Separate merge not supported for SME.
- ARM_COMPUTE_UNUSED(out, in, stride, y0, ymax, x0, xmax, bias, act, accumulate);
}
};
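
Dropping ARM_COMPUTE_UNUSED in favour of unnamed parameters, as above, silences -Wunused-parameter without a macro while leaving the signature unchanged for callers. The idiom in isolation:

    struct MergerStub {
        // Parameter names omitted (or commented out) instead of a macro:
        void Merge(int * /*out*/, int /*stride*/) {
            // Separate merge not supported; deliberately a no-op.
        }
    };
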
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
index ef5a01a578..5aa62f0fe4 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -132,7 +132,9 @@ template void Transform<8, 1, true, VLType::None>(float *, const float *, int, i
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int);
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#ifdef ARM_COMPUTE_ENABLE_BF16
template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int);
+#endif
#endif // AArch32
} // namespace arm_gemm
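
The new ARM_COMPUTE_ENABLE_BF16 guard keeps the bfloat16 explicit instantiation out of builds where that type is not provided; an unguarded "template void ..." line is itself a definition and would reference the type unconditionally. A sketch with an illustrative template of the same shape:

    #include <cstdint>

    template<typename TOut, typename TIn>
    void Convert(TOut *out, const TIn *in, int n) {
        for (int i = 0; i < n; i++) out[i] = static_cast<TOut>(in[i]);
    }

    // Always-available instantiation:
    template void Convert<float, uint16_t>(float *, const uint16_t *, int);

    #ifdef ARM_COMPUTE_ENABLE_BF16
    // The bfloat16-consuming instantiation would sit here, compiled only
    // when the build provides the type:
    // template void Convert<float, bfloat16>(float *, const bfloat16 *, int);
    #endif
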
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
index e6186984e8..8574d89226 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -193,7 +193,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -264,7 +263,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt
"add %x[out], %x[out], #0x80\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -286,4 +284,5 @@ void Transform<32, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
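
The repeated "#ifdef __aarch64__" to "#if defined(__aarch64__)" conversions in the transpose headers are stylistic but not pointless: the "#if defined(...)" form composes with further conditions, and echoing the expression on the matching #endif keeps long guarded files readable. For example (the combined condition is illustrative):

    #if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
    // ... fixed-width AArch64 kernels ...
    #endif // defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
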
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
index 6d97f71c7d..cdf1f98608 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -427,4 +427,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
index 96d132b74f..da0809d4d6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 12 * roundup<size_t>(height, 8) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -332,4 +331,5 @@ void Transform<12, 8, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
index 04af6fd713..cef468e9cc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -236,7 +236,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
@@ -319,7 +318,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -341,4 +339,5 @@ void Transform<12, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
index e6ddc10e04..4c02d0534d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -276,7 +276,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -420,7 +419,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x60\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -442,4 +440,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
index e487d4d839..2a3208d18d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -710,7 +710,6 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0x60\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -731,4 +730,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
index 7938325fa4..4d9d5e7f43 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -182,7 +182,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,7 +250,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi
"add %x[out], %x[out], #0x18\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
index 4c66fb2c2f..b0cd7e4ef7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -182,7 +182,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,7 +250,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t
"add %x[out], %x[out], #0x18\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
index f06c167361..0399f8becc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -137,4 +137,5 @@ void Transform<4, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
index e0ccb368c2..f3a1dde73f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -327,4 +327,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
index fa45f4fd4d..7c7e91e666 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 16 * roundup<size_t>(height, 8) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -288,4 +287,5 @@ void Transform<16, 8, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
index 06efa9781e..b4515cbfd4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -163,7 +163,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 16f\n"
"8:" // Main loop skip
-
"9:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
@@ -221,7 +220,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x40\n"
"bge 9b\n"
"16:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -243,4 +241,5 @@ void Transform<16, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
index dafa53eec3..ac67467240 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -320,7 +320,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -486,7 +485,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x80\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -508,4 +506,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
index e012d0920f..b9fe8b126a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -281,7 +281,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
"bge 1b\n"
"cbz %x[height], 16f\n"
"8:" // Main loop skip
-
"9:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -423,7 +422,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0x80\n"
"bge 9b\n"
"16:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -444,4 +442,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
index 20f9d39f4e..46211ad4e4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -158,7 +158,6 @@ void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -268,4 +267,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
index 22d68acd51..1cb7bc4445 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -762,7 +762,6 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0xc0\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -783,4 +782,5 @@ void Transform<24, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
index 799a9cd91d..dcaf69d2a8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -198,7 +198,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -271,7 +270,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -292,4 +290,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
index 621c5f99ff..966b75664e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -270,7 +270,6 @@ void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t w
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -291,4 +290,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
index 5cd7bd0512..4a22675028 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -503,4 +503,5 @@ void Transform<32, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
index 706d7cd359..237536697c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -280,7 +280,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 24f\n"
"12:" // Main loop skip
-
"13:" // Tail row loop: Head
"mov x25, %x[in]\n"
"mov x20, %x[width]\n"
@@ -427,7 +426,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x80\n"
"bge 13b\n"
"24:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -449,4 +447,5 @@ void Transform<32, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
index b4827525cd..f35752d5a8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -137,7 +137,6 @@ void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -241,4 +240,5 @@ void Transform<6, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
index e1ab14e594..6ef02ac044 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 4 * roundup<size_t>(height, 16) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
"add x16, x17, %x[in_stride]\n"
@@ -316,4 +315,5 @@ void Transform<4, 16, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
index 8adc69e8b3..5667820865 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -333,4 +333,5 @@ void Transform<4, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
index 07602bdc8d..328274a488 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -145,7 +145,6 @@ void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,4 +250,5 @@ void Transform<32, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
index a048fbb109..feb469ab0e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -177,7 +177,6 @@ void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -265,4 +264,5 @@ void Transform<24, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
index 01921c5ad9..a4d480c405 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -140,4 +139,5 @@ void Transform<16, 1, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
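[Editor's note: unlike the aarch64 transforms above, the SME transforms also swap the guard macro itself. __ARM_FEATURE_SVE is defined by the compiler when the target architecture includes SVE, whereas ARM_COMPUTE_ENABLE_SME is a build-system option, so these kernels can be compiled into a multi-ISA library and dispatched at runtime instead of depending on the compiler's target flags. Sketch of the distinction (comments only, illustrative):

#if defined(__ARM_FEATURE_SVE)        // set by the compiler, e.g. for -march=...+sve
#endif // defined(__ARM_FEATURE_SVE)

#if defined(ARM_COMPUTE_ENABLE_SME)   // set by the build system when SME kernels are requested
#endif // defined(ARM_COMPUTE_ENABLE_SME)
]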
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
index 6b9b471fdc..552abfc1c6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -178,4 +177,5 @@ void Transform<16, 4, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
index 96128cf9c2..9c6f5c83a1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -150,4 +149,5 @@ void Transform<16, 2, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
index 080db1c5c1..2756327815 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -197,4 +196,5 @@ void Transform<16, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
index 7e496095f4..a6ddb8fec0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -210,4 +209,5 @@ void Transform<1, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
index 45d3c0729e..399a52e233 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -146,4 +145,5 @@ void Transform<1, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
index 7120d1d33e..6318e29a79 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -206,4 +205,4 @@ void Transform<1, 2, true, VLType::SME>(
);
}
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
index 72e7b0c99a..b90063028d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -219,4 +218,5 @@ void Transform<1, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
index a057fd514e..f827197ab7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -222,4 +221,5 @@ void Transform<2, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
index 9eb4075677..c471d66e17 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -146,4 +145,5 @@ void Transform<2, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
index 3fc3920500..5f967fa615 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -208,4 +207,5 @@ void Transform<2, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
index 9d402a2d58..f22b833821 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -236,4 +235,5 @@ void Transform<2, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
index 362bebbea0..14636e3218 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -185,4 +184,5 @@ void Transform<4, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
index cbcc0b4c8b..2d46a481f3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -121,4 +120,5 @@ void Transform<4, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
index 9b28578217..002a12479a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -168,4 +167,5 @@ void Transform<4, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
index 8873070019..2a43f34f71 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -186,4 +185,5 @@ void Transform<4, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000000..be9ad666a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 8 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x2\n"
+ "ptrue p7.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x21, x22\n"
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z31.h }, p0/Z, [x25]\n"
+ "dech x21\n"
+ "whilelt p6.h, XZR, x21\n"
+ "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p5.h, XZR, x21\n"
+ "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p4.h, XZR, x21\n"
+ "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p3.h, XZR, x21\n"
+ "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p2.h, XZR, x21\n"
+ "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p1.h, XZR, x21\n"
+ "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n"
+ "dech x21\n"
+ "mov x20, x23\n"
+ "ld1h { z24.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x21\n"
+ "dech x22, ALL, MUL #8\n"
+ "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "addvl x25, x25, #8\n"
+ "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
+ "st1h { z31.h }, p7, [x20]\n"
+ "addvl x24, x24, #8\n"
+ "st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
+ "st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
+ "st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
+ "st1h { z27.h }, p7, [x20, #4, MUL VL]\n"
+ "st1h { z26.h }, p7, [x20, #5, MUL VL]\n"
+ "st1h { z25.h }, p7, [x20, #6, MUL VL]\n"
+ "st1h { z23.h }, p7, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x2\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #8\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "st1h { z23.h }, p7, [x23]\n"
+ "addvl x25, x25, #8\n"
+ "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+ "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+ "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+ "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+ "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+ "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
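[Editor's note: the three Transform<8, 1, true, VLType::SME> specializations above differ only in element type; each reinterprets its rows as 16-bit words, scaling the width by sizeof(T) / 2 and the row stride by sizeof(T). A generic wrapper equivalent to them might look like the following (a sketch under the assumption that sizeof(T) is a multiple of 2; the function name is hypothetical):

template <typename T>
void transform_8vl_as_u16(T *out, const T *in, int stride, int x0, int xmax, int k0, int kmax)
{
    sme_transpose_interleave_8VL(
        reinterpret_cast<uint16_t *>(out),
        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
        (xmax - x0) * sizeof(T) / 2,  // width in 16-bit units
        stride * sizeof(T),           // input row stride in bytes
        kmax - k0);                   // number of rows
}
]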
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..45d2e24258
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p2.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z24.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z17.b, z16.b\n"
+ "zip2 z22.b, z17.b, z16.b\n"
+ "addvl x24, x24, #2\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x22]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
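[Editor's note: the CSEL sequence at the head of the main row loop above redirects any of the four row pointers that would run past the remaining height to the zero-filled pad_row, so the interleave body can always read four rows unconditionally. A scalar sketch of that selection (assumption-level, not the kernel itself):

#include <cstddef>
#include <cstdint>

// Any row index at or beyond the remaining height reads from pad_row instead.
static inline void select_rows(const uint8_t *in, size_t in_stride, size_t height,
                               const uint8_t *pad_row, const uint8_t *row[4])
{
    for (size_t r = 0; r < 4; r++) {
        row[r] = (r < height) ? in + r * in_stride : pad_row;
    }
}
]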
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..ec7c415e27
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x24]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z17.h }, p3/Z, [x23]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "zip1 z23.h, z20.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip2 z22.h, z20.h, z17.h\n"
+ "zip1 z21.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z23.h }, p4, [x22]\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
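[Editor's note: the ZIP1/ZIP2 pairs above interleave two source rows element by element, which is what gives the 2x2 block layout its name. A scalar reference for the same permutation (illustrative only):

#include <cstddef>
#include <cstdint>

// Consecutive output elements alternate between row 0 and row 1.
static void zip2x2_reference(uint16_t *out, const uint16_t *row0,
                             const uint16_t *row1, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        out[2 * i + 0] = row0[i];
        out[2 * i + 1] = row1[i];
    }
}
]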
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
index 847718992a..f627fe575f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -373,4 +372,5 @@ void Transform<12, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
index 74fce4ddf9..b33c4f6c2d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -101,7 +100,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x21, %x[width]\n"
"cntw x20, ALL, MUL #2\n"
@@ -138,7 +136,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"addvl %x[out], %x[out], #1\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23"
@@ -160,4 +157,5 @@ void Transform<1, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
index a034be5e74..e468787815 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -305,4 +304,5 @@ void Transform<1, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
index 82d4184061..546800fa69 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -93,7 +92,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 8f\n"
"4:" // Main loop skip
-
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
"add %x[in], x26, %x[in_stride]\n"
@@ -123,7 +121,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"addvl %x[out], %x[out], #3\n"
"bge 5b\n"
"8:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
@@ -171,4 +168,5 @@ void Transform<3, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
index ec7095db7b..a44141c109 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -329,7 +328,6 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"addvl %x[out], %x[out], #3\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -364,4 +362,5 @@ void Transform<3, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
index 3d14383a64..36a15a16b3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -292,7 +291,6 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #3\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -314,4 +312,5 @@ void Transform<3, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
index a39235187f..e661e2698a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -103,7 +102,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 8f\n"
"4:" // Main loop skip
-
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
"add %x[in], x26, %x[in_stride]\n"
@@ -137,7 +135,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"addvl %x[out], %x[out], #4\n"
"bge 5b\n"
"8:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -185,4 +182,5 @@ void Transform<4, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
index e3489398d4..03a78f72f1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -317,4 +316,5 @@ void Transform<4, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
index 9505dc5e6d..b196799cfe 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -248,7 +247,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
"mov x21, %x[width]\n"
@@ -323,7 +321,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #4\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -345,4 +342,5 @@ void Transform<4, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
index 982c0545ed..68fe2d0cbe 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -292,4 +291,5 @@ void Transform<6, 8, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
index 2b5741a49c..910fc6cb02 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -260,7 +259,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
"add x11, x12, %x[in_stride]\n"
@@ -386,7 +384,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #6\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -408,4 +405,5 @@ void Transform<6, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
index 146da33869..f0f10d2f43 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -235,4 +234,5 @@ void Transform<6, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
index f6fc5e8b84..c638eaacde 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -296,7 +295,6 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t
"addvl %x[out], %x[out], #6\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -318,4 +316,5 @@ void Transform<6, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
index 07147acd8e..0526bd0596 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -281,7 +280,6 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -303,4 +301,5 @@ void Transform<8, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
index 3ba50fee60..98f0770d77 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -283,4 +282,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
index 6b5ca38ab1..3fa5292143 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -256,4 +255,5 @@ void Transform<8, 8, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
index 237e9b684f..02977ecf1e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -354,7 +353,6 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -376,4 +374,5 @@ void Transform<8, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
index 51cae7dd5a..34799c60a6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -439,7 +438,6 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -461,4 +459,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
index 4ad882870e..5a48e579ae 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -279,4 +278,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index a28ddadc68..11b1bd3e05 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,7 +80,8 @@ inline T roundup(const T a, const T b) {
enum class VLType {
None,
SVE,
- SME
+ SME,
+ SME2
};
template<typename T>
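[Editor's note: the new SME2 enumerator extends the vector-length taxonomy without disturbing existing values. A minimal sketch of how a caller might treat it (the enum mirrors the one above under a distinct name; the helper is hypothetical):

enum class VLTypeSketch { None, SVE, SME, SME2 };

// True when the kernel's vector length is decided by the hardware at runtime.
static constexpr bool is_scalable(VLTypeSketch v)
{
    return v == VLTypeSketch::SVE || v == VLTypeSketch::SME || v == VLTypeSketch::SME2;
}
]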
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 836402e83d..50290757ec 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "arm_gemm.hpp"
#include <cstddef>
namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transform.hpp b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp
index 113b7ea928..265551288d 100644
--- a/src/core/NEON/kernels/convolution/winograd/input_transform.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#pragma once
-#include "arm_compute/core/Error.h"
-
-#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "winograd.hpp"
#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include <algorithm>
@@ -191,10 +189,9 @@ class TransformDirect : public TransformBase<TIn, TOut>
TOut *outptr, size_t ld_out_matrix,
unsigned int pad_top, unsigned int valid_rows,
unsigned int pad_left, unsigned int valid_cols,
- void *working_space
+ void *
) const override
{
- ARM_COMPUTE_UNUSED(working_space);
const auto end_i = this->get_input_rows() - pad_top;
const auto pad_bottom = end_i < valid_rows ? 0 : end_i - valid_rows;
const auto end_j = this->get_input_cols() - pad_left;
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp
index 44f8752a0c..ae589f9772 100644
--- a/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp
@@ -30,12 +30,12 @@ namespace winograd {
namespace input_transform {
void arm_fp32_1x8(
- unsigned int n_channels,
- const float * input_base,
+ const unsigned int n_channels,
+ const float *const input_base,
size_t, // We don't need to stride over rows
- size_t input_col_stride,
+ const size_t input_col_stride,
float *outptr,
- size_t matrix_stride
+ const size_t matrix_stride
)
{
constexpr int inner_tile_cols = 8;
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
index 5148495608..971cc99cd2 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "winograd.hpp"
#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp
index bbf0ce58b4..cce3745c77 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp
@@ -33,13 +33,13 @@ namespace output_transform {
void arm_fp32_1x2_1x7(
unsigned int n_channels,
const float* inptr,
- size_t matrix_stride,
+ const size_t matrix_stride,
const float* bptr,
float *outptr,
size_t, // No need to stride across rows
- size_t output_col_stride,
- float output_min,
- float output_max
+ const size_t output_col_stride,
+ const float output_min,
+ const float output_max
)
{
constexpr auto inner_tile_cols = 8u, output_tile_cols = 2u;
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp
index feb2a5a2c1..10f25a4aab 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp
@@ -33,13 +33,13 @@ namespace output_transform {
void arm_fp32_1x4_1x5(
unsigned int n_channels,
const float* inptr,
- size_t matrix_stride,
+ const size_t matrix_stride,
const float* bptr,
float *outptr,
size_t, // No need to stride across rows
- size_t output_col_stride,
- float output_min,
- float output_max
+ const size_t output_col_stride,
+ const float output_min,
+ const float output_max
)
{
constexpr auto inner_tile_cols = 8u, output_tile_cols = 4u;
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp
index ffe60e700d..c45ff8cf2c 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp
@@ -34,13 +34,13 @@ namespace output_transform {
void arm_fp32_1x6_1x3(
unsigned int n_channels,
const float* inptr,
- size_t matrix_stride,
+ const size_t matrix_stride,
const float* bptr,
float *outptr,
size_t, // No need to stride across rows
- size_t output_col_stride,
- float output_min,
- float output_max
+ const size_t output_col_stride,
+ const float output_min,
+ const float output_max
)
{
constexpr unsigned int inner_tile_cols = 8, output_tile_cols = 6;
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp
index 54749e6f28..55ed24cd74 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
#include <cstddef>
@@ -888,5 +888,4 @@ void sme_fp32_mopa_4x4_3x3(
} // namespace winograd
} // namespace arm_conv
-#endif //defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
-
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp
index a221aee5d8..0a7030324e 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,9 +30,9 @@ namespace winograd {
namespace output_transform {
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
void sme_fp32_mopa_4x4_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
#endif // defined(__aarch64__)
void arm_fp32_4x4_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
void arm_fp32_2x2_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
@@ -49,9 +49,9 @@ void arm_fp32_1x2_1x7(unsigned int, const float *, size_t, const float *, float
static const TransformImplementation<float> transforms_fp32[] = {
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{ IMPL(4, 4, 3, 3, sme_fp32_mopa_4x4_3x3, Unpadded), MethodConstraints::RequiresSME },
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
#endif // defined(__aarch64__)
{ IMPL(4, 4, 3, 3, arm_fp32_4x4_3x3, Unpadded), MethodConstraints::LargerShape },
{ IMPL(2, 2, 3, 3, arm_fp32_2x2_3x3, Unpadded) },
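
transforms_fp32[] is an ordered candidate table: the specialised SME entry sits first and carries a MethodConstraints::RequiresSME guard, with the portable arm_fp32 implementations as fallbacks, so selection is first match wins. A minimal sketch of that dispatch pattern (Entry, supported and select are illustrative names, not the library's types):

    #include <cstddef>

    struct Entry
    {
        int kern_rows, kern_cols; // kernel shape this transform handles
        void (*fn)();             // the transform to run (signature elided)
        bool (*supported)();      // runtime check, e.g. SME; null = always
    };

    // Walk the table in order; earlier, more specialised entries win.
    const Entry *select(const Entry *table, std::size_t n, int kr, int kc)
    {
        for (std::size_t i = 0; i < n; i++)
        {
            if (table[i].kern_rows == kr && table[i].kern_cols == kc &&
                (table[i].supported == nullptr || table[i].supported()))
            {
                return &table[i];
            }
        }
        return nullptr;
    }
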
diff --git a/src/core/NEON/kernels/convolution/winograd/padding.cpp b/src/core/NEON/kernels/convolution/winograd/padding.cpp
deleted file mode 100644
index aca8448658..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/padding.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <cstring>
-#include <cstdint>
-
-#include "padding.hpp"
-
-namespace padding
-{
-template <typename T>
-void copy_and_pad_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int pad_top,
- unsigned int pad_left,
- unsigned int pad_bottom,
- unsigned int pad_right,
- T pad_value
-)
-{
-  for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
-  {
-    for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
-    {
-      T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
-
-      if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
-          out_j < pad_left || tile_cols - pad_right <= out_j)
-      {
-        for (unsigned int n = 0; n < n_channels; n++)
-        {
-          output[n] = pad_value;
-        }
-      }
-      else
-      {
-        const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
-        const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
-        std::memcpy(output, input, n_channels * sizeof(T));
-      }
-    }
-  }
-}
-
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const uint8_t *, unsigned int, unsigned int,
- uint8_t *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
-);
-
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const float *, unsigned int, unsigned int,
- float *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, float
-);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const __fp16 *, unsigned int, unsigned int,
- __fp16 *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, __fp16
-);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <unsigned int TileRows, unsigned int TileCols>
-void CopyCropped<TileRows, TileCols>::execute(
- const size_t size,
- const void * const inptr,
- const size_t in_row_stride,
- const size_t in_col_stride,
- void * const outptr,
- const size_t out_row_stride,
- const size_t out_col_stride,
- const unsigned int pad_top,
- const unsigned int pad_left,
- const unsigned int pad_bottom,
- const unsigned int pad_right
-)
-{
-  for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
-  {
-    for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
-    {
-      std::memcpy(
-        static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
-        static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
-        size
-      );
-    }
-  }
-}
-
-template class CopyCropped<2, 2>;
-template class CopyCropped<3, 3>;
-template class CopyCropped<4, 4>;
-
-template <typename T>
-void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-)
-{
-  for (unsigned int out_i = 0, in_i = crop_top; in_i < tile_rows - crop_bottom; out_i++, in_i++)
-  {
-    for (unsigned int out_j = 0, in_j = crop_left; in_j < tile_cols - crop_right; out_j++, in_j++)
-    {
-      std::memcpy(
-        outptr + out_i*out_row_stride + out_j*out_col_stride,
-        inptr + in_i*in_row_stride + in_j*in_col_stride,
-        sizeof(T) * n_channels
-      );
-    }
-  }
-}
-
-template void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const float *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const __fp16 *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- __fp16 *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace padding
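
The deleted file provided generic tile padding and cropping: an output element inside the padded border is filled with pad_value, every other element is memcpy'd from the corresponding input position, and cropping is the inverse copy. A self-contained, float-only restatement of the padding logic with a worked 2x2-into-4x4 example (illustrative names; the original templates above are the authoritative form):

    #include <cstdio>
    #include <cstring>

    // Same logic as the removed copy_and_pad_tile, specialised to float:
    // border elements take pad_value, interior elements are copied.
    static void pad_tile(unsigned rows, unsigned cols, unsigned channels,
                         const float *in, unsigned in_rs, unsigned in_cs,
                         float *out, unsigned out_rs, unsigned out_cs,
                         unsigned top, unsigned left,
                         unsigned bottom, unsigned right, float pad_value)
    {
        for (unsigned i = 0; i < rows; i++)
        {
            for (unsigned j = 0; j < cols; j++)
            {
                float *const o = out + i * out_rs + j * out_cs;
                if (i < top || rows - bottom <= i || j < left || cols - right <= j)
                {
                    for (unsigned n = 0; n < channels; n++) o[n] = pad_value;
                }
                else
                {
                    std::memcpy(o, in + (i - top) * in_rs + (j - left) * in_cs,
                                channels * sizeof(float));
                }
            }
        }
    }

    int main()
    {
        const float in[2 * 2] = { 1.f, 2.f, 3.f, 4.f };
        float out[4 * 4];
        // Pad the 2x2 input into a 4x4 tile with a one-element zero border:
        //   0 0 0 0 / 0 1 2 0 / 0 3 4 0 / 0 0 0 0
        pad_tile(4, 4, 1, in, 2, 1, out, 4, 1, 1, 1, 1, 1, 0.f);
        for (unsigned i = 0; i < 4; i++)
        {
            std::printf("%g %g %g %g\n",
                        out[4 * i + 0], out[4 * i + 1],
                        out[4 * i + 2], out[4 * i + 3]);
        }
        return 0;
    }
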
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
index db0f53df1b..5569bc1b89 100644
--- a/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "winograd.hpp"
#include <algorithm>
#include <functional>
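
The repo-rooted include becomes a plain local include; presumably winograd.hpp now lives alongside these convolution sources (or on the kernel include path), so the quoted form resolves first relative to the including file:

    // Quoted includes search the including file's directory before -I paths,
    // so a header co-located with the source needs no repo-rooted prefix.
    #include "winograd.hpp"
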
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
index 510f69baaa..af0dd04298 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "winograd.hpp"
#include <memory>
#include <string>
@@ -314,6 +314,7 @@ bool get_implementation(
false, // Indirect input
{}, // No activation
max_threads,
+ false, // Not fixed format
fast_mode,
gemm_cfg
));
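
The new argument slots into the middle of the get_implementation call, which is only reviewable because each positional argument at this call site carries an annotation. A small illustration of the convention (hypothetical signature, not the library's API):

    struct GemmConfig {};

    // Dummy signature; the point is the annotated call site below, where a
    // swapped or forgotten boolean would be obvious at a glance.
    static bool configure(bool indirect_input, int max_threads,
                          bool fixed_format, bool fast_mode,
                          const GemmConfig *cfg)
    {
        return !indirect_input && !fixed_format && fast_mode &&
               max_threads > 0 && cfg == nullptr;
    }

    static bool demo()
    {
        return configure(
            false,   // Indirect input
            4,       // max_threads
            false,   // Not fixed format (the flag added by this patch)
            true,    // fast_mode
            nullptr  // gemm_cfg
        );
    }
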